Necessary functions and libraries are imported.
library(dplyr)
library(tidyr)
library(data.table)
library(lubridate)
library(ggplot2)
library(GGally)
library(urca)
library(forecast)
library(rjson)
library(zoo)
library(caret)
library(ggcorrplot)
read_data <- function(path){
  # Read ProjectRawData.csv and indir.json from `path`, align the JSON
  # feed's columns with the CSV layout, stack the two sources, and return
  # a single data.table with negative prices replaced by NA.
  old_wd <- setwd(path)
  on.exit(setwd(old_wd), add = TRUE)  # restore the caller's working directory
  data <- read.csv("ProjectRawData.csv")
  data$event_date <- ymd(data$event_date)
  # NOTE(review): `|` keeps any row where at least one of the two fields is
  # present; if rows missing *either* field should be dropped, this should
  # be `&` — confirm the intent before changing.
  data <- data[!(is.na(data$event_date)) | !(is.na(data$product_content_id)),]
  data <- data.table(data)
  result <- fromJSON(file = "indir.json")
  # Build the JSON table in one pass instead of growing it with rbind in a
  # loop; `json_table` also avoids shadowing base::table().
  json_table <- do.call(rbind, lapply(seq_along(result), function(i) as.data.frame(result[i])))
  # Reorder the JSON columns into the CSV column order.
  json_table <- data.table(json_table[, c(2,3,1,4,5,7,6,8,10,12,13,9,11)])
  json_table$event_date <- as.Date(json_table$event_date)
  # Keep only JSON rows newer than the CSV extract.
  json_table <- json_table[event_date >= "2021-05-29",]
  new_table_len <- nrow(json_table)
  current_data <- rbind(json_table, data)
  if (new_table_len > 0) {
    # The stacked JSON rows need one more column permutation to line up
    # with the CSV part; guard against an empty JSON feed (the original
    # 1:new_table_len / -(1:new_table_len) misbehaves when it is 0).
    data_corrected <- current_data[seq_len(new_table_len), c(1:7,12,8:11,13)]
    colnames(data_corrected) <- colnames(current_data)
    data_deficit <- current_data[-seq_len(new_table_len),]
    data_corrected <- data.table(rbind(data_corrected, data_deficit))
  } else {
    data_corrected <- current_data
  }
  data_corrected[price < 0, price := NA]  # negative prices are data errors
  return(data_corrected)
}
# Load and merge the raw CSV and JSON data (hard-coded local project
# directory; adjust when running on another machine).
data <- read_data("/Users/ozgurcan/Archive/PDFLibrary/IE360/IE360ClassProject")
visit_count_calc <- function(data_prod){
  # Impute visit_count for older periods by chained back-casting: fit
  # visit_count ~ var on the trusted (newer) rows, predict the next-older
  # window, fold those predictions into the training set, and repeat.
  # `var` is basket_count centered on its product-level mean.
  # The four hand-unrolled stages of the original are one loop here.
  data_prod <- data_prod[,var:=basket_count-mean(basket_count)]
  # Window boundaries, newest first; the oldest bound is inclusive.
  breaks <- c("2021-01-29", "2020-12-10", "2020-10-10", "2020-08-01", "2020-05-25")
  train <- data_prod[data_prod$event_date > breaks[1]]
  for (i in seq_len(length(breaks) - 1)) {
    upper <- breaks[i]
    lower <- breaks[i + 1]
    is_last <- i == length(breaks) - 1
    test <- if (is_last) {
      data_prod[data_prod$event_date <= upper & data_prod$event_date >= lower]
    } else {
      data_prod[data_prod$event_date <= upper & data_prod$event_date > lower]
    }
    model <- lm(visit_count ~ var, train)
    pred <- predict(model, data.frame(var = test$var))
    test$visit_count <- round(pred)
    test_train <- rbind(train, test)
    # Everything predicted so far becomes training data for the next,
    # older window (same re-filter the original performed).
    train <- test_train[test_train$event_date > lower]
  }
  # Positional assignment, as in the original: data_prod and test_train
  # are assumed to hold rows in the same (newest-window-first) order.
  data_prod$visit_count <- test_train$visit_count
  # Clamp negative regression predictions to zero.
  data_prod$visit_count <- ifelse(data_prod$visit_count <= 0, 0, data_prod$visit_count)
  return(data_prod)
}
favored_count_calc <- function(data_prod){
  # Impute favored_count for older periods by chained back-casting: fit
  # favored_count ~ var on the trusted (newer) rows, predict the
  # next-older window, fold the predictions back in, and repeat.
  # `var` is basket_count centered on its product-level mean.
  # The four hand-unrolled stages of the original are one loop here.
  data_prod <- data_prod[,var:=basket_count-mean(basket_count)]
  # Window boundaries, newest first; the oldest bound is inclusive.
  breaks <- c("2021-01-29", "2020-12-10", "2020-10-10", "2020-08-01", "2020-05-25")
  train <- data_prod[data_prod$event_date > breaks[1]]
  for (i in seq_len(length(breaks) - 1)) {
    upper <- breaks[i]
    lower <- breaks[i + 1]
    is_last <- i == length(breaks) - 1
    test <- if (is_last) {
      data_prod[data_prod$event_date <= upper & data_prod$event_date >= lower]
    } else {
      data_prod[data_prod$event_date <= upper & data_prod$event_date > lower]
    }
    model <- lm(favored_count ~ var, train)
    pred <- predict(model, data.frame(var = test$var))
    test$favored_count <- round(pred)
    test_train <- rbind(train, test)
    # Predictions made so far join the training set for the next window.
    train <- test_train[test_train$event_date > lower]
  }
  # Positional assignment, as in the original (rows assumed aligned).
  data_prod$favored_count <- test_train$favored_count
  # Clamp negative regression predictions to zero.
  data_prod$favored_count <- ifelse(data_prod$favored_count <= 0, 0, data_prod$favored_count)
  return(data_prod)
}
category_basket_calc <- function(data_prod){
  # Impute category_basket for older periods by chained back-casting on
  # `var`, the mean-centered category_favored. Same rolling scheme as
  # visit_count_calc; the four unrolled stages are one loop here.
  data_prod <- data_prod[,var:=category_favored-mean(category_favored)]
  # Window boundaries, newest first; the oldest bound is inclusive.
  breaks <- c("2021-01-29", "2020-12-10", "2020-10-10", "2020-08-01", "2020-05-25")
  train <- data_prod[data_prod$event_date > breaks[1]]
  for (i in seq_len(length(breaks) - 1)) {
    upper <- breaks[i]
    lower <- breaks[i + 1]
    is_last <- i == length(breaks) - 1
    test <- if (is_last) {
      data_prod[data_prod$event_date <= upper & data_prod$event_date >= lower]
    } else {
      data_prod[data_prod$event_date <= upper & data_prod$event_date > lower]
    }
    model <- lm(category_basket ~ var, train)
    pred <- predict(model, data.frame(var = test$var))
    test$category_basket <- round(pred)
    test_train <- rbind(train, test)
    # Predictions made so far join the training set for the next window.
    train <- test_train[test_train$event_date > lower]
  }
  # Positional assignment, as in the original (rows assumed aligned).
  # Note: unlike the *_count imputers, no clamping to zero is applied.
  data_prod$category_basket <- test_train$category_basket
  return(data_prod)
}
category_brand_sold_calc <- function(data_prod){
  # Impute category_brand_sold for older periods by chained back-casting
  # on `var`, the mean-centered category_sold. Same rolling scheme as
  # visit_count_calc; the four unrolled stages are one loop here.
  data_prod <- data_prod[,var:=category_sold-mean(category_sold)]
  # Window boundaries, newest first; the oldest bound is inclusive.
  breaks <- c("2021-01-29", "2020-12-10", "2020-10-10", "2020-08-01", "2020-05-25")
  train <- data_prod[data_prod$event_date > breaks[1]]
  for (i in seq_len(length(breaks) - 1)) {
    upper <- breaks[i]
    lower <- breaks[i + 1]
    is_last <- i == length(breaks) - 1
    test <- if (is_last) {
      data_prod[data_prod$event_date <= upper & data_prod$event_date >= lower]
    } else {
      data_prod[data_prod$event_date <= upper & data_prod$event_date > lower]
    }
    model <- lm(category_brand_sold ~ var, train)
    pred <- predict(model, data.frame(var = test$var))
    test$category_brand_sold <- round(pred)
    test_train <- rbind(train, test)
    # Predictions made so far join the training set for the next window.
    train <- test_train[test_train$event_date > lower]
  }
  # Positional assignment, as in the original (rows assumed aligned).
  # Note: no clamping to zero, unlike the *_count imputers.
  data_prod$category_brand_sold <- test_train$category_brand_sold
  return(data_prod)
}
ty_visits_calc <- function(data_prod){
  # Impute ty_visits for older periods by chained back-casting on `var`,
  # the mean-centered category_sold. Same rolling scheme as
  # visit_count_calc; the four unrolled stages are one loop here.
  # NOTE(review): `var` is derived from category_sold, exactly as in
  # category_brand_sold_calc — possibly a copy-paste; confirm the driver
  # variable is intentional.
  data_prod <- data_prod[,var:=category_sold-mean(category_sold)]
  # Window boundaries, newest first; the oldest bound is inclusive.
  breaks <- c("2021-01-29", "2020-12-10", "2020-10-10", "2020-08-01", "2020-05-25")
  train <- data_prod[data_prod$event_date > breaks[1]]
  for (i in seq_len(length(breaks) - 1)) {
    upper <- breaks[i]
    lower <- breaks[i + 1]
    is_last <- i == length(breaks) - 1
    test <- if (is_last) {
      data_prod[data_prod$event_date <= upper & data_prod$event_date >= lower]
    } else {
      data_prod[data_prod$event_date <= upper & data_prod$event_date > lower]
    }
    model <- lm(ty_visits ~ var, train)
    pred <- predict(model, data.frame(var = test$var))
    test$ty_visits <- round(pred)
    test_train <- rbind(train, test)
    # Predictions made so far join the training set for the next window.
    train <- test_train[test_train$event_date > lower]
  }
  # Positional assignment, as in the original (rows assumed aligned).
  # Note: no clamping to zero, unlike the *_count imputers.
  data_prod$ty_visits <- test_train$ty_visits
  return(data_prod)
}
data_manip <- function(product_id, normal = TRUE, discount = TRUE, shift = TRUE){
  # Extract one product's rows from the global `data`, impute the helper
  # columns via the *_calc functions, fill missing prices, and optionally
  # add a `discount` column and center/scale the numeric features.
  # `shift` is accepted for backward compatibility but is currently unused.
  # (Defaults changed from T to TRUE — behavior-identical, but T is
  # reassignable and therefore unsafe as a literal.)
  data_prod <- data[data$product_content_id == product_id,]
  data_prod <- visit_count_calc(data_prod)
  data_prod <- favored_count_calc(data_prod)
  data_prod <- category_basket_calc(data_prod)
  data_prod <- category_brand_sold_calc(data_prod)
  data_prod <- ty_visits_calc(data_prod)
  # Fill missing prices from the nearest observed price in either direction.
  data_prod <- data_prod %>% fill(price, .direction = "up")
  data_prod <- data_prod %>% fill(price, .direction = "down")
  data_prod$product_content_id <- NULL
  data_prod$var <- NULL
  data_prod <- arrange(data_prod, event_date)
  if(discount){
    # Day-over-day price drop. `shift(price)` here resolves to the
    # data.table::shift *function* (R skips the non-function `shift`
    # argument when looking up a call), not this function's parameter.
    data_prod[,lag_1:=shift(price)]
    data_prod[,discount := (lag_1-price)]
    data_prod$discount <- na.fill(data_prod$discount, 0)
  }
  if(normal){
    # Center and scale the feature columns (hard-coded positions 4:12).
    preproc <- preProcess(data_prod[,c(4:12)], method=c("center", "scale"))
    new_cols <- predict(preproc, data_prod[,c(4:12)])
    data_prod[,c(4:12)] <- new_cols
  }
  data_prod$lag_1 <- NULL
  return(data_prod)
}
accu <- function(actual, forecast) {
  # Summary accuracy statistics of `forecast` against `actual`.
  # Returns a one-row data.frame with columns n, mean, sd, CV, FBias,
  # MAPE, RMSE, MAD, MADP and WMAPE (column names come from the local
  # variable names, so those names must not change).
  error <- actual - forecast
  n <- length(actual)
  mean <- mean(actual)  # local shadow of base mean(); also a returned column
  sd <- sd(actual)
  CV <- sd / mean
  FBias <- sum(error) / sum(actual)
  MAPE <- sum(abs(error / actual)) / n
  RMSE <- sqrt(sum(error^2) / n)
  MAD <- sum(abs(error)) / n
  MADP <- sum(abs(error)) / sum(abs(actual))
  WMAPE <- MAD / mean
  data.frame(n, mean, sd, CV, FBias, MAPE, RMSE, MAD, MADP, WMAPE)
}
forecast_with_lr <- function(fmla, data, forecast_data) {
  # Fit a linear model described by the formula string `fmla` on `data`,
  # then predict on `forecast_data`.
  # Returns list(forecast = numeric predictions, model = the lm fit).
  model <- lm(as.formula(fmla), data)
  preds <- predict(model, forecast_data)
  list(forecast = as.numeric(preds), model = model)
}
forecast_with_arima <- function(data, forecast_ahead, target_name = 'sold_count',
                                is_seasonal = FALSE, is_stepwise = FALSE,
                                is_trace = TRUE, is_approx = FALSE, xreg1 = NULL){
  # Fit auto.arima on the `target_name` column of `data` (optionally with
  # external regressors) and forecast `forecast_ahead` steps ahead.
  # The original built the extraction with eval(parse(text = ...)); a
  # plain [[ ]] lookup is equivalent and safe.
  input_series <- data[[target_name]]
  fitted <- auto.arima(input_series, seasonal = is_seasonal,
                       trace = is_trace, stepwise = is_stepwise,
                       approximation = is_approx, xreg = xreg1)
  # NOTE(review): tail(xreg1, forecast_ahead) reuses the *last observed*
  # regressor rows as the future values — confirm this is intended.
  forecasted <- predict(fitted, n.ahead = forecast_ahead,
                        newxreg = tail(xreg1, forecast_ahead))$pred
  return(list(forecast = forecasted, model = fitted))
}
forecast_with_arima_extended <- function(data, forecast_ahead, target_name = 'sold_count',
                                         is_seasonal = FALSE, is_stepwise = FALSE,
                                         is_trace = TRUE, is_approx = FALSE,
                                         seasonality_period = NULL, fitted_model = NULL,
                                         xreg1 = NULL, decomposed = NULL){
  # Like forecast_with_arima, but can (a) model the random component of an
  # additive decomposition and re-add seasonal + trend to the forecasts,
  # and (b) reuse a previously fitted ARIMA via `fitted_model`.
  if (is_seasonal && !is.null(seasonality_period)) {
    input_series <- ts(data[[target_name]], freq = seasonality_period)
  } else {
    # [[ ]] lookup replaces the original eval(parse(text = ...)).
    input_series <- data[[target_name]]
  }
  input_series_decomposed <- NULL
  series_to_fit <- input_series
  if (!is.null(decomposed)) {
    input_series_decomposed <- decompose(input_series, type = "additive")
    series_to_fit <- input_series_decomposed$random
  }
  if (is.null(fitted_model)) {
    fitted <- auto.arima(series_to_fit, seasonal = is_seasonal,
                         trace = is_trace, stepwise = is_stepwise,
                         approximation = is_approx, xreg = xreg1)
  } else {
    # Bug fix: the original passed `random`, which was undefined whenever
    # `fitted_model` was supplied without `decomposed`; refit on whichever
    # series we are actually modelling.
    fitted <- Arima(series_to_fit, model = fitted_model)
  }
  if (is.null(xreg1)) {
    forecasted <- predict(fitted, n.ahead = forecast_ahead)$pred
  } else {
    # NOTE(review): tail(xreg1, ...) reuses the last observed regressor
    # rows as future values — confirm this is intended.
    forecasted <- predict(fitted, n.ahead = forecast_ahead,
                          newxreg = tail(xreg1, forecast_ahead))$pred
  }
  if (!is.null(decomposed)) {
    # Re-add the seasonal pattern and the last known trend level.
    # Bug fix: the original indexed with length(ts) — `ts` there is the
    # base R function (length 1), not the series — so the seasonal phase
    # was always computed from 1. Use the series length instead.
    n <- length(input_series)
    seas_idx <- (n %% 7 + 1):(n %% 7 + forecast_ahead)
    forecasted <- forecasted + input_series_decomposed$seasonal[seas_idx] +
      tail(input_series_decomposed$trend[!is.na(input_series_decomposed$trend)], 1)
  }
  return(list(forecast = forecasted, model = fitted))
}
decomp <- function(data, frequ, approach){
  # Additive decomposition of sold_count at the requested frequency:
  # 7 = daily series, 52 = weekly means, 12 = monthly means. Plots the
  # decomposition, prints a KPSS test on the random component, and
  # returns that random component (invisible NULL for other frequencies).
  # `approach` is kept for interface compatibility but is unused.
  # Side effect: the 52/12 branches add week/month/year columns to `data`
  # by reference (data.table := semantics).
  if (frequ == 7) {
    # `sales_ts` avoids the original's shadowing of base::ts with `ts`.
    sales_ts <- ts(data$sold_count, freq = frequ, start = c(2020, 01, 25))
    decomposed <- decompose(sales_ts, type = "additive")
    plot(decomposed)
    print(summary(ur.kpss(decomposed$random)))
    return(decomposed$random)
  } else if (frequ == 52) {
    data[, week := week(event_date)]
    data[, year := year(event_date)]
    weekly_data <- data[, list(mean_sales = mean(sold_count)), by = list(week, year)]
    weekly_data[, year_week := paste0(year, "-", week)]
    sales_ts <- ts(weekly_data$mean_sales, freq = frequ, start = c(2020, 21))
    decomposed <- decompose(sales_ts, type = "additive")
    plot(decomposed)
    # Bug fix: the original computed ur.kpss(...) and discarded the
    # result; print its summary as the daily branch does.
    print(summary(ur.kpss(decomposed$random)))
    return(decomposed$random)
  } else if (frequ == 12) {
    data[, month := month(event_date)]
    data[, year := year(event_date)]
    monthly_data <- data[, list(mean_sales = mean(sold_count)), by = list(month, year)]
    monthly_data[, year_month := paste0(year, "-", month)]
    sales_ts <- ts(monthly_data$mean_sales, freq = frequ, start = c(2020, 05))
    decomposed <- decompose(sales_ts, type = "additive")
    plot(decomposed)
    # Bug fix: same as the weekly branch — print the KPSS summary.
    print(summary(ur.kpss(decomposed$random)))
    return(decomposed$random)
  }
}
cor_plot <- function(data, product_name){
  # Lower-triangle correlogram of the numeric feature columns (hard-coded
  # positions 2:13; column 1 is event_date). Correlations are rounded to
  # two decimals before plotting.
  # (The original called head(data) and discarded the result — dead code,
  # removed.)
  corr <- round(cor(data[,c(2:13)]), 2)
  ggcorrplot(corr, hc.order = TRUE,
             type = "lower",
             lab = TRUE,
             lab_size = 3,
             method="circle",
             colors = c("tomato2", "white", "springgreen3"),
             title=paste0("Correlogram of ", product_name),
             ggtheme=theme_bw)
}
forecast_func <- function(data,ar_order, reg_matrix){
# Rolling one-week backtest: for each of the last 7 observations, fit an
# ARIMA(ar_order) with external regressors on the data up to two steps
# before the target, forecast 2 steps ahead, keep the second step, and
# re-add the seasonal term and last trend level from an additive weekly
# decomposition of the training window.
# Note: `test` below is assigned but never used afterwards (dead code).
test <- data[(length(data)-6):length(data)]
results=vector('list',7)
for(i in 1:7){
# Training window ends 8-i observations before the end of the series.
past_data=data[1:(length(data)-9+i)]
ts = ts(past_data, freq=7, start = c(2020,01,25))
decomposed <- decompose(ts, type = "additive")
reg_matrix_past <- reg_matrix[1:(length(data)-9+i),1:ncol(reg_matrix)]
# Single regressor row for the target date, transposed to a 1-row matrix.
# NOTE(review): predict() with a 1-row newxreg for n.ahead = 2 recycles
# rows — this is the source of the "number of rows of result is not a
# multiple of vector length" warnings seen in the report; confirm the
# row/step alignment is as intended.
reg_matrix_forecast = t(reg_matrix[length(data)-7+i, 1:ncol(reg_matrix)])
forecast_data=data.table(data[length(data)-7+i])
model <- arima(past_data, order = ar_order, xreg = reg_matrix_past)
# Second-step prediction plus seasonal component indexed by loop
# position i and the last non-NA trend value.
# NOTE(review): decomposed$seasonal[i] indexes by loop position, not by
# the target date's day-of-week — verify the seasonal phase is correct.
forecasted=predict(model, n.ahead = 2, newxreg = reg_matrix_forecast)$pred[2] + decomposed$seasonal[i] + tail(decomposed$trend[!is.na(decomposed$trend)],1)
forecast_data[,prediction:=forecasted]
results[[i]]=forecast_data
}
results <- rbindlist(results)
colnames(results) <- c("Actual", "Prediction")
return(results)
}
plotting <- function(data, product_name, col, num = 1){
  # Plot either the raw sales series (num == 1) or an actual-vs-prediction
  # comparison (any other num). `col` is only used by the single-series
  # plot; returns the ggplot object.
  if (num != 1) {
    return(
      ggplot(data) +
        geom_line(aes(x = event_date, y = Actual, color = "Actual")) +
        geom_line(aes(x = event_date, y = Prediction, color = "Prediction")) +
        labs(x = "Date", y = "Sales",
             title = paste0("Comparison of Sales and Predictions for ", product_name)) +
        theme_minimal()
    )
  }
  ggplot(data) +
    geom_line(aes(x = event_date, y = sold_count), color = col) +
    labs(x = "Date", y = "The Number of Sold",
         title = paste0("Sales(", product_name, ")")) +
    theme_minimal()
}
# Duplicate definition: an identical `accu` appears earlier in this file;
# sourcing the whole file simply rebinds the same function.
# Computes summary accuracy statistics of `forecast` against `actual` and
# returns them as a one-row data.frame.
accu=function(actual,forecast){
n=length(actual)
error=actual-forecast
mean=mean(actual)  # local shadow of base mean(); also a returned column
sd=sd(actual)
CV=sd/mean  # coefficient of variation
FBias=sum(error)/sum(actual)
MAPE=sum(abs(error/actual))/n  # Inf/NaN if any actual value is 0
RMSE=sqrt(sum(error^2)/n)
MAD=sum(abs(error))/n
MADP=sum(abs(error))/sum(abs(actual))
WMAPE=MAD/mean
l=data.frame(n,mean,sd,CV,FBias,MAPE,RMSE,MAD,MADP,WMAPE)
return(l)
}
The data is imported.
# Prepare product 85004 (facial cleanser) without normalization and
# preview the prepared columns. (Style note: `F` literal kept as-is.)
data_85004 <- data_manip("85004", normal = F)
head(data_85004)
## event_date price sold_count visit_count basket_count favored_count
## 1: 2020-05-25 79.34083 36 2275 233 450
## 2: 2020-05-26 78.52238 42 2570 282 509
## 3: 2020-05-27 79.81943 35 2245 228 444
## 4: 2020-05-28 80.95862 29 1884 168 371
## 5: 2020-05-29 80.54769 39 1848 162 364
## 6: 2020-05-30 80.28000 38 1878 167 370
## category_sold category_visits category_basket category_favored
## 1: 260 4039 149496 16032
## 2: 303 4258 151516 16397
## 3: 246 3733 142893 14839
## 4: 311 3941 146496 15490
## 5: 342 4058 146081 15415
## 6: 355 4333 147819 15729
## category_brand_sold ty_visits discount
## 1: 24614 98583816 0.0000000
## 2: 25993 101280102 0.8184524
## 3: 24165 97705955 -1.2970476
## 4: 26250 101781737 -1.1391921
## 5: 27244 103725571 0.4109284
## 6: 27661 104540727 0.2676923
First, a daily decomposition will be applied. As the project report shows, the variance of sales does not increase over time; therefore, the decomposition should be additive.
# Daily (freq = 7) additive decomposition; prints the KPSS test on the
# random component. NOTE(review): decomp() declares a third `approach`
# argument without a default — this call only works because it is unused.
decomp_85004 <- decomp(data_85004, 7)
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 5 lags.
##
## Value of test-statistic is: 0.0061
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
The results from the unit root test belong to the random component of the decomposition. Since the test statistic is lower than all of the critical values, the null hypothesis fails to be rejected. The null hypothesis is that the data are stationary.
If the plot is analyzed, it can be seen that there is a slight upward trend, and there are some outliers, probably occurring in special days.
Weekly and monthly decomposition cannot be applied because the data do not cover two full years of sales. With fewer than two complete periods, this type of decomposition raises an error.
Now, the autocorrelation and partial autocorrelation graphs will be analyzed.
# ACF/PACF of the random component; na.pass skips the decomposition's NAs.
acf(decomp_85004, na.action = na.pass)
pacf(decomp_85004, na.action = na.pass)
There is a significant autocorrelation in lag-1 and exponential decay can be seen in this graph. In partial autocorrelation graph, there is a sudden decrease after lag-1, and the other lags have negative autocorrelation. Seasonal autocorrelation is not observed in these graphs.
According to the plots above, ARIMA (1,0,0) can be applied as a baseline model.
# Baseline AR(1) model on the random component.
model1 <- arima(decomp_85004, order = c(1,0,0))
summary(model1)
##
## Call:
## arima(x = decomp_85004, order = c(1, 0, 0))
##
## Coefficients:
## ar1 intercept
## 0.2466 -0.0300
## s.e. 0.0488 2.0809
##
## sigma^2 estimated as 967.6: log likelihood = -1908.57, aic = 3823.15
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.009354584 31.10615 19.89165 58.93116 148.3303 0.8090659
## ACF1
## Training set 0.04421066
Since lag-3 variable has a significant autocorrelation, ARIMA (3,0,0) can also be tried.
# AR(3): adds the significant lag-3 term.
model2 <- arima(decomp_85004, order = c(3,0,0))
summary(model2)
##
## Call:
## arima(x = decomp_85004, order = c(3, 0, 0))
##
## Coefficients:
## ar1 ar2 ar3 intercept
## 0.2337 -0.0837 -0.3192 -0.0055
## s.e. 0.0477 0.0489 0.0476 1.2531
##
## sigma^2 estimated as 840.4: log likelihood = -1881.08, aic = 3772.16
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.01427997 28.99012 18.2404 42.76788 188.251 0.7419034 -0.03548906
Lastly, lag-4 variable has also significant autocorrelation, and it is the last variable being significant. The model is below.
# AR(4): last lag reported significant; chosen model for this product.
model3 <- arima(decomp_85004, order = c(4,0,0))
summary(model3)
##
## Call:
## arima(x = decomp_85004, order = c(4, 0, 0))
##
## Coefficients:
## ar1 ar2 ar3 ar4 intercept
## 0.1978 -0.0930 -0.2932 -0.1109 -0.0027
## s.e. 0.0501 0.0488 0.0488 0.0499 1.1213
##
## sigma^2 estimated as 829.9: log likelihood = -1878.63, aic = 3769.26
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.0149663 28.80806 18.12071 32.11328 201.7454 0.7370354
## ACF1
## Training set -0.01431356
The lowest AIC value belongs to ARIMA (4,0,0) model. It can be seen that AIC value of ARIMA (3,0,0) and AIC value of ARIMA(4,0,0) are very close to each other. Adding more variables would increase the complexity, which is not intended. Therefore, this model can be chosen as the last model.
It is interesting that the sales occurred three days ago has a significant negative effect on the current sales.
In the project, most variables (basket_count, favored_count, category_sold) are not independent of each other. Therefore, using multiple regressors in the model is not appropriate: it would give rise to a multicollinearity problem.
Now, the correlation matrix will be observed.
# Correlogram of the numeric features for product 85004.
cor_plot(data_85004, "Yüz Temizleyici")
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
The number of sales have high correlation with category_sold, basket_count, and category_visit. Since the features are scaled differently, they should be normalized.
# Re-prepare with normalization and add category_sold as a regressor.
data_85004_norm <- data_manip("85004", normal = T)
model4 <- arima(decomp_85004, order = c(4,0,0), xreg = cbind(data_85004_norm$category_sold))
summary(model4)
##
## Call:
## arima(x = decomp_85004, order = c(4, 0, 0), xreg = cbind(data_85004_norm$category_sold))
##
## Coefficients:
## ar1 ar2 ar3 ar4 intercept
## 0.4353 -0.0058 -0.1970 0.2206 -0.2107
## s.e. 0.0640 0.0546 0.0537 0.0524 2.1697
## cbind(data_85004_norm$category_sold)
## 25.9015
## s.e. 2.3926
##
## sigma^2 estimated as 556.9: log likelihood = -1800.19, aic = 3614.38
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.0001624377 23.59839 16.71755 -78.71831 320.5869 0.6799636
## ACF1
## Training set -0.02322312
It can be said that AIC value decreases, which is preferred.
Now, basket_count will be added, instead of category_sold.
# Same AR(4) model with basket_count as the regressor instead.
model5 <- arima(decomp_85004, order = c(4,0,0), xreg = cbind(data_85004_norm$basket_count))
summary(model5)
##
## Call:
## arima(x = decomp_85004, order = c(4, 0, 0), xreg = cbind(data_85004_norm$basket_count))
##
## Coefficients:
## ar1 ar2 ar3 ar4 intercept
## 0.493 0.1612 -0.0578 0.2672 0.5393
## s.e. 0.049 0.0551 0.0551 0.0497 8.8115
## cbind(data_85004_norm$basket_count)
## 40.5112
## s.e. 2.3229
##
## sigma^2 estimated as 603.6: log likelihood = -1816.42, aic = 3646.84
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.1744417 24.56843 16.61896 -124.7445 326.3701 0.6759538
## ACF1
## Training set -0.05429853
It has higher AIC value, compared to the model with category_sold. Therefore, it will not be chosen as final model.
Now, lastly, category_visits will be used as a regressor.
# Same AR(4) model with category_visits as the regressor.
model6 <- arima(decomp_85004, order = c(4,0,0), xreg = cbind(data_85004_norm$category_visits))
summary(model6)
##
## Call:
## arima(x = decomp_85004, order = c(4, 0, 0), xreg = cbind(data_85004_norm$category_visits))
##
## Coefficients:
## ar1 ar2 ar3 ar4 intercept
## 0.2503 -0.0838 -0.2720 0.0932 -0.1135
## s.e. 0.0848 0.0585 0.0566 0.0864 1.2069
## cbind(data_85004_norm$category_visits)
## 18.2364
## s.e. 3.1085
##
## sigma^2 estimated as 585.5: log likelihood = -1809.99, aic = 3633.97
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.01356929 24.19623 16.78329 -137.1836 378.3565 0.6826374
## ACF1
## Training set -0.004464586
This model has a lower AIC value compared to the previous model, but the lowest AIC value belongs to the first model, with the number of sold products in the same category. Therefore, model with category_sold will be used as a final model.
Now, forecasts for the last week will be made.
# Rolling backtest of the chosen AR(4) + category_sold model over the
# last 7 days (source of the newxreg recycling warnings below).
comparison_85004 <- forecast_func(data = data_85004$sold_count, ar_order = c(4,0,0), reg_matrix = cbind(data_85004_norm$category_sold))
## Warning in cbind(intercept = rep(1, n.ahead), newxreg): number of rows of result
## is not a multiple of vector length (arg 1)
## Warning in cbind(intercept = rep(1, n.ahead), newxreg): number of rows of result
## is not a multiple of vector length (arg 1)
## Warning in cbind(intercept = rep(1, n.ahead), newxreg): number of rows of result
## is not a multiple of vector length (arg 1)
## Warning in cbind(intercept = rep(1, n.ahead), newxreg): number of rows of result
## is not a multiple of vector length (arg 1)
## Warning in cbind(intercept = rep(1, n.ahead), newxreg): number of rows of result
## is not a multiple of vector length (arg 1)
## Warning in cbind(intercept = rep(1, n.ahead), newxreg): number of rows of result
## is not a multiple of vector length (arg 1)
## Warning in cbind(intercept = rep(1, n.ahead), newxreg): number of rows of result
## is not a multiple of vector length (arg 1)
# Attach the last week's dates and plot actual vs predicted sales.
comparison_85004 <- data.table(cbind(data_85004$event_date[(nrow(data_85004)-6):nrow(data_85004)], comparison_85004))
colnames(comparison_85004) <- c("event_date", "Actual", "Prediction")
plotting(comparison_85004,product_name = "Yüz Temizleyici", col = "",num =2)
It is obvious that the predictions are always larger than the actual sales. The reason may be that sales in January were very high while the number of sales decreased in the last month. If a seasonal term were added, the model would perform better, as in the project.
The error rates are below.
# Accuracy metrics of the backtest for product 85004.
accu(comparison_85004$Actual, comparison_85004$Prediction)
## n mean sd CV FBias MAPE RMSE MAD MADP
## 1 7 63.57143 13.86671 0.2181281 -1.097157 1.160731 71.59575 69.74785 1.097157
## WMAPE
## 1 1.097157
As expected, its WMAPE is very high; our model does not perform well. The reason is discussed above. Unlike the project, there is no seasonal component.
The data is imported.
# Prepare product 6676673 (Bluetooth headphones) without normalization
# and preview the prepared columns.
data_6676673 <- data_manip("6676673", normal = F)
head(data_6676673)
## event_date price sold_count visit_count basket_count favored_count
## 1: 2020-05-25 137.6873 539 21850 1783 1443
## 2: 2020-05-26 134.4552 724 25401 2106 1721
## 3: 2020-05-27 136.8115 763 27380 2286 1876
## 4: 2020-05-28 138.8387 597 22004 1797 1455
## 5: 2020-05-29 136.1015 695 22675 1858 1508
## 6: 2020-05-30 134.7339 587 21608 1761 1424
## category_sold category_visits category_basket category_favored
## 1: 674 10271 319887 25365
## 2: 882 4546 223382 16909
## 3: 944 4346 198549 14733
## 4: 758 4371 207371 15506
## 5: 860 3833 180049 13112
## 6: 756 3456 176009 12758
## category_brand_sold ty_visits discount
## 1: 20525 109186825 0.000000
## 2: 22745 113831777 3.232121
## 3: 23407 115216330 -2.356326
## 4: 21422 111062671 -2.027127
## 5: 22510 113340484 2.737164
## 6: 21400 111018008 1.367595
The daily decomposition is made below.
# Daily additive decomposition with KPSS test (the unused `approach`
# argument of decomp() is again omitted).
decomp_6676673 <- decomp(data_6676673, 7)
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 5 lags.
##
## Value of test-statistic is: 0.0081
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
Since the test statistic is lower than all of the critical values, the null hypothesis fails to be rejected. The null hypothesis is that the data are stationary.
When the plot is analyzed, it can be realized that there is a decreasing trend at the last days. However, in general, there is no obvious trend. The sales fluctuate a lot.
Weekly and monthly decomposition cannot be applied because the data do not include sales happened in 2 years. There are less than two periods. Therefore, this type of decomposition brings an error message.
Now, the autocorrelation and partial autocorrelation graphs will be interpreted.
# ACF/PACF of the random component; na.pass skips the decomposition's NAs.
acf(decomp_6676673, na.action = na.pass)
pacf(decomp_6676673, na.action = na.pass)
There is a significant autocorrelation in lag-1 and exponential decay can be seen in this graph. In partial autocorrelation graph, there is a sudden decrease after lag-1, and the other lags have negative autocorrelation. Seasonal autocorrelation is not observed in these graphs.
According to the plots above, ARIMA (1,0,0) can be applied as a baseline model.
# Baseline AR(1) model on the random component.
model1 <- arima(decomp_6676673, order = c(1,0,0))
summary(model1)
##
## Call:
## arima(x = decomp_6676673, order = c(1, 0, 0))
##
## Coefficients:
## ar1 intercept
## 0.1424 -0.0590
## s.e. 0.0500 6.9516
##
## sigma^2 estimated as 13978: log likelihood = -2433.29, aic = 4872.58
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.00365306 118.2273 74.59765 92.65507 127.3235 0.7856261
## ACF1
## Training set 0.02545937
Since lag-2 variable has a significant autocorrelation, ARIMA (2,0,0) can also be tried.
# AR(2): adds the significant lag-2 term.
model2 <- arima(decomp_6676673, order = c(2,0,0))
summary(model2)
##
## Call:
## arima(x = decomp_6676673, order = c(2, 0, 0))
##
## Coefficients:
## ar1 ar2 intercept
## 0.1679 -0.1779 -0.0186
## s.e. 0.0497 0.0496 5.8128
##
## sigma^2 estimated as 13533: log likelihood = -2426.97, aic = 4861.93
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.01287377 116.3307 72.519 94.92378 158.3284 0.7637348 -0.06321928
Lastly, lag-3 variable has also significant autocorrelation, and it is the last variable being significant. Its partial autocorrelation is very significant. The model is below.
# AR(3): last lag reported significant; chosen base model for this product.
model3 <- arima(decomp_6676673, order = c(3,0,0))
summary(model3)
##
## Call:
## arima(x = decomp_6676673, order = c(3, 0, 0))
##
## Coefficients:
## ar1 ar2 ar3 intercept
## 0.1048 -0.1178 -0.3554 0.0551
## s.e. 0.0471 0.0470 0.0471 4.0151
##
## sigma^2 estimated as 11809: log likelihood = -2400.4, aic = 4810.8
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.05308118 108.6706 69.25685 66.3144 218.8212 0.7293794
## ACF1
## Training set -0.03836247
The last model, ARIMA (3,0,0) has the lowest AIC value. Adding more variables would increase the complexity and the predictions cannot improve suddenly. Therefore, this model can be chosen as the last model.
In the project, most variables (basket_count, favored_count, category_sold) are not independent of each other. Therefore, using multiple regressors in the model is not appropriate: it would give rise to a multicollinearity problem.
Now, the correlation matrix will be observed.
# Correlogram of the numeric features for product 6676673.
cor_plot(data_6676673, "Bluetooth Kulaklık")
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
The number of sales have high correlation with visit_count, basket_count, and category_sold. Since the features are scaled differently, they should be normalized. In addition, price has a strong negative correlation with the number of sales. Therefore, it can be added into the model.
# Normalized features; category_sold and price as external regressors.
data_6676673_norm <- data_manip("6676673", normal = T)
model4 <- arima(decomp_6676673, order = c(3,0,0), xreg = cbind(data_6676673_norm$category_sold, data_6676673_norm$price))
summary(model4)
##
## Call:
## arima(x = decomp_6676673, order = c(3, 0, 0), xreg = cbind(data_6676673_norm$category_sold,
## data_6676673_norm$price))
##
## Coefficients:
## ar1 ar2 ar3 intercept
## 0.1610 -0.0920 -0.2646 -90.8411
## s.e. 0.0623 0.0543 0.0628 48.8740
## cbind(data_6676673_norm$category_sold, data_6676673_norm$price)1
## 54.6718
## s.e. 8.0607
## cbind(data_6676673_norm$category_sold, data_6676673_norm$price)2
## 0.6679
## s.e. 0.3591
##
## sigma^2 estimated as 9056: log likelihood = -2348.13, aic = 4710.26
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.01565502 95.16138 65.87713 113.274 346.2983 0.6937859
## ACF1
## Training set 0.01599591
AIC value of the last model is lower than the model without the regressor. It can be said that this model is superior.
Now, basket_count will be added, instead of category_sold.
# Same AR(3) model with basket_count (plus price) as regressors.
model5 <- arima(decomp_6676673, order = c(3,0,0), xreg = cbind(data_6676673_norm$basket_count, data_6676673_norm$price))
summary(model5)
##
## Call:
## arima(x = decomp_6676673, order = c(3, 0, 0), xreg = cbind(data_6676673_norm$basket_count,
## data_6676673_norm$price))
##
## Coefficients:
## ar1 ar2 ar3 intercept
## 0.1282 -0.1078 -0.2972 -42.0739
## s.e. 0.0546 0.0510 0.0557 42.8633
## cbind(data_6676673_norm$basket_count, data_6676673_norm$price)1
## 50.8962
## s.e. 6.0049
## cbind(data_6676673_norm$basket_count, data_6676673_norm$price)2
## 0.3107
## s.e. 0.3150
##
## sigma^2 estimated as 8829: log likelihood = -2343.19, aic = 4700.38
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.07665345 93.96506 64.40542 77.25282 323.9058 0.6782865
## ACF1
## Training set 0.006499918
It has lower AIC value, compared to the model with category_sold. This model is better.
Now, lastly, visit_count will be used as a regressor.
model6 <- arima(decomp_6676673, order = c(3,0,0), xreg = cbind(data_6676673_norm$visit_count, data_6676673_norm$price))
summary(model6)
##
## Call:
## arima(x = decomp_6676673, order = c(3, 0, 0), xreg = cbind(data_6676673_norm$visit_count,
## data_6676673_norm$price))
##
## Coefficients:
## ar1 ar2 ar3 intercept
## 0.1323 -0.1108 -0.3036 -6.6193
## s.e. 0.0532 0.0502 0.0539 42.5681
## cbind(data_6676673_norm$visit_count, data_6676673_norm$price)1
## 46.7090
## s.e. 5.6364
## cbind(data_6676673_norm$visit_count, data_6676673_norm$price)2
## 0.0491
## s.e. 0.3128
##
## sigma^2 estimated as 9127: log likelihood = -2349.7, aic = 4713.41
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.07348484 95.53383 65.73031 76.93394 313.1336 0.6922396
## ACF1
## Training set 0.002902161
This model has a higher AIC value compared to the previous model. According to these results, the model with basket_count will be used as the final model.
Now, forecasts for the last week will be made.
comparison_6676673 <- forecast_func(data = data_6676673$sold_count, ar_order = c(3,0,0), reg_matrix = cbind(data_6676673_norm$basket_count, data_6676673_norm$price))
comparison_6676673 <- data.table(cbind(data_6676673$event_date[(nrow(data_6676673)-6):nrow(data_6676673)], comparison_6676673))
colnames(comparison_6676673) <- c("event_date", "Actual", "Prediction")
plotting(comparison_6676673,product_name = "Bluetooth Kulaklık", col = "",num =2)
It is obvious that the predictions are always larger than the actual sales, as with the other products. If a seasonal term were added, the model would perform better, as elsewhere in the project. In addition, the model is successful at catching changes: when the actual sales decrease, the prediction also decreases, and vice versa.
The error rates are below.
accu(comparison_6676673$Actual, comparison_6676673$Prediction)
## n mean sd CV FBias MAPE RMSE MAD MADP
## 1 7 362 135.8553 0.3752909 -1.663524 1.85408 605.6803 602.1955 1.663524
## WMAPE
## 1 1.663524
As expected, its WMAPE is very high. Our model does not perform well. The reason behind this is mentioned above.
The data is imported.
data_32737302 <- data_manip("32737302", normal = F)
head(data_32737302)
## event_date price sold_count visit_count basket_count favored_count
## 1: 2020-05-25 59.99 40 3191 211 385
## 2: 2020-05-26 59.99 44 3191 211 385
## 3: 2020-05-27 59.99 33 2183 141 241
## 4: 2020-05-28 59.99 33 2313 150 259
## 5: 2020-05-29 59.99 33 2471 161 282
## 6: 2020-05-30 59.99 39 2586 169 298
## category_sold category_visits category_basket category_favored
## 1: 1365 1806 274764 9738
## 2: 1223 1700 246892 8778
## 3: 848 1164 150675 5464
## 4: 722 1052 153636 5566
## 5: 968 1374 207058 7406
## 6: 1279 1758 255544 9076
## category_brand_sold ty_visits discount
## 1: 38084 112754071 0
## 2: 35220 111701576 0
## 3: 27656 108922100 0
## 4: 25115 107988196 0
## 5: 30077 109811532 0
## 6: 36349 112116645 0
The daily decomposition is made below.
decomp_32737302 <- decomp(data_32737302, 7)
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 5 lags.
##
## Value of test-statistic is: 0.0156
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
Since people tend to buy bikini during the Spring and Summer, the number of sales in Winter and Fall is almost zero. There is an upward trend after March 2021.
Now, their autocorrelation and partial autocorrelation plots will be observed.
acf(decomp_32737302, na.action = na.pass)
pacf(decomp_32737302, na.action = na.pass)
There is a significant autocorrelation in lag-1. In partial autocorrelation graph, there is a sudden decrease after lag-1, and the other lags have negative autocorrelation. Seasonal autocorrelation is not observed in these graphs. These graphs are very similar to the other products’ autocorrelation and partial autocorrelation graphs.
According to the plots above, ARIMA (1,0,0) can be applied as a baseline model.
model1 <- arima(decomp_32737302, order = c(1,0,0))
summary(model1)
##
## Call:
## arima(x = decomp_32737302, order = c(1, 0, 0))
##
## Coefficients:
## ar1 intercept
## 0.1774 0.0196
## s.e. 0.0498 0.3934
##
## sigma^2 estimated as 41.2: log likelihood = -1288.31, aic = 2582.62
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.002697806 6.418337 3.680862 99.52789 113.1288 0.7701051
## ACF1
## Training set 0.05434223
Since lag-2 variable has a significant autocorrelation, ARIMA (2,0,0) can also be tried.
model2 <- arima(decomp_32737302, order = c(2,0,0))
summary(model2)
##
## Call:
## arima(x = decomp_32737302, order = c(2, 0, 0))
##
## Coefficients:
## ar1 ar2 intercept
## 0.2393 -0.3194 0.0183
## s.e. 0.0483 0.0490 0.2849
##
## sigma^2 estimated as 37.15: log likelihood = -1268.12, aic = 2544.24
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.003258286 6.095244 3.51417 80.20563 140.2229 0.73523
## ACF1
## Training set -0.08382818
Lastly, the lag-3 variable also has a significant autocorrelation, and it is the last variable that is significant. Its partial autocorrelation is very significant. The model is below.
model3 <- arima(decomp_32737302, order = c(3,0,0))
summary(model3)
##
## Call:
## arima(x = decomp_32737302, order = c(3, 0, 0))
##
## Coefficients:
## ar1 ar2 ar3 intercept
## 0.1573 -0.2547 -0.2776 0.0100
## s.e. 0.0487 0.0485 0.0498 0.2157
##
## sigma^2 estimated as 34.41: log likelihood = -1253.16, aic = 2516.32
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.003733522 5.865789 3.336035 65.12503 138.9254 0.6979609
## ACF1
## Training set -0.01845377
The last model, ARIMA (3,0,0), has the lowest AIC value. Adding more variables would increase the complexity while the predictions would not improve substantially. Therefore, this model can be chosen as the final model.
The effects of the sales that occurred 2 days ago and 3 days ago are almost the same.
Now, the correlation matrix will be observed.
cor_plot(data_32737302, "Bikini Üstü (2)")
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
The number of sales has a high correlation with visit_count, basket_count, and favored_count. Since the features are scaled differently, they should be normalized.
In general, the fashion products are favored before bought. People add some products into the favored list, then they choose whatever they want. Therefore, favored_count has a very strong correlation with the number of sales.
Now, the models will be constructed by using these features one by one.
data_32737302_norm <- data_manip("32737302", normal = T)
model4 <- arima(decomp_32737302, order = c(3,0,0), xreg = cbind(data_32737302_norm$visit_count))
summary(model4)
##
## Call:
## arima(x = decomp_32737302, order = c(3, 0, 0), xreg = cbind(data_32737302_norm$visit_count))
##
## Coefficients:
## ar1 ar2 ar3 intercept
## 0.1508 -0.2587 -0.2743 0.0348
## s.e. 0.0491 0.0485 0.0504 0.2109
## cbind(data_32737302_norm$visit_count)
## 0.8363
## s.e. 0.2264
##
## sigma^2 estimated as 33.22: log likelihood = -1246.23, aic = 2504.46
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.005202095 5.763299 3.300996 64.64493 120.6665 0.6906299
## ACF1
## Training set -0.0132655
The AIC value of the last model is slightly lower than that of the model without the regressor. It can be said that this model is superior.
Now, basket_count will be added, instead of visit count.
model5 <- arima(decomp_32737302, order = c(3,0,0), xreg = cbind(data_32737302_norm$basket_count))
summary(model5)
##
## Call:
## arima(x = decomp_32737302, order = c(3, 0, 0), xreg = cbind(data_32737302_norm$basket_count))
##
## Coefficients:
## ar1 ar2 ar3 intercept
## 0.1565 -0.2513 -0.2700 0.0339
## s.e. 0.0495 0.0489 0.0507 0.2126
## cbind(data_32737302_norm$basket_count)
## 0.9305
## s.e. 0.2299
##
## sigma^2 estimated as 32.93: log likelihood = -1244.55, aic = 2501.11
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.005218935 5.738889 3.300982 65.25441 121.8915 0.690627
## ACF1
## Training set -0.01016151
It has a slightly lower AIC value compared to the model with visit_count. This model is better.
Now, lastly, favored_count will be used as a regressor.
model6 <- arima(decomp_32737302, order = c(3,0,0), xreg = cbind(data_32737302_norm$favored_count))
summary(model6)
##
## Call:
## arima(x = decomp_32737302, order = c(3, 0, 0), xreg = cbind(data_32737302_norm$favored_count))
##
## Coefficients:
## ar1 ar2 ar3 intercept
## 0.1411 -0.2592 -0.2847 0.0272
## s.e. 0.0487 0.0483 0.0499 0.2074
## cbind(data_32737302_norm$favored_count)
## 0.8924
## s.e. 0.2261
##
## sigma^2 estimated as 33.09: log likelihood = -1245.48, aic = 2502.97
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.005183828 5.752192 3.301584 64.59097 120.3607 0.690753
## ACF1
## Training set -0.01665027
This model has a marginally higher AIC value compared to the previous model. According to these results, the model with basket_count will be used as the final model.
Now, forecasts for the last week will be made.
comparison_32737302 <- forecast_func(data = data_32737302$sold_count, ar_order = c(3,0,0), reg_matrix = cbind(data_32737302_norm$basket_count))
comparison_32737302 <- data.table(cbind(data_32737302$event_date[(nrow(data_32737302)-6):nrow(data_32737302)], comparison_32737302))
colnames(comparison_32737302) <- c("event_date", "Actual", "Prediction")
plotting(comparison_32737302,product_name = "Bikini Üstü (2)", col = "",num =2)
It is obvious that the predictions are always larger than the actual sales, as with the other products. In addition, the model is successful at catching changes: when the actual sales decrease, the prediction also decreases, and vice versa.
The error rates are below.
accu(comparison_32737302$Actual, comparison_32737302$Prediction)
## n mean sd CV FBias MAPE RMSE MAD MADP
## 1 7 59.85714 17.78978 0.2972039 -1.141202 1.236991 68.74815 68.30909 1.141202
## WMAPE
## 1 1.141202
As expected, its WMAPE is very high. Our model does not perform well. The reason behind this is that, although the random component passes the unit root test, having no sales for months severely degrades the model.
Let’s examine the data of the “Dik Süpürge”.
data_7061886 <- data_manip("7061886", normal = F)
head(data_7061886)
## event_date price sold_count visit_count basket_count favored_count
## 1: 2020-05-25 238.8718 39 2471 136 236
## 2: 2020-05-26 237.8758 33 2139 117 205
## 3: 2020-05-27 240.6333 36 2541 140 243
## 4: 2020-05-28 236.8132 38 2751 152 262
## 5: 2020-05-29 232.9308 39 2227 122 213
## 6: 2020-05-30 229.2183 71 3993 223 379
## category_sold category_visits category_basket category_favored
## 1: 102 520 56048 2171
## 2: 110 508 55097 2118
## 3: 129 510 51490 1917
## 4: 140 529 54756 2099
## 5: 118 489 52369 1966
## 6: 162 608 57322 2242
## category_brand_sold ty_visits discount
## 1: 6727 105692032 0.0000000
## 2: 6956 107205276 0.9960373
## 3: 7498 110799229 -2.7575757
## 4: 7813 112879939 3.8201754
## 5: 7184 108718519 3.8823887
## 6: 8441 117041358 3.7124593
As we can see below, data set has no significant level of increasing variance over time, thus decomposition should be done in an “additive” way.
After the decomposition, let’s check the KPSS test results for stationarity. The test statistic is well below the critical values, thus we fail to reject the null hypothesis, which is the stationarity of the data.
Also, if we try to decompose the data with a weekly and monthly perspective, we get an error since the dataset has no data that can support at least two periods. Therefore, we can only continue with the daily decomposition.
Furthermore, let’s analyze the ACF and PACF plots.
decomp_7061886 <- decomp(data_7061886, 7)
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 5 lags.
##
## Value of test-statistic is: 0.0068
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
As we can see, the ACF plot shows that there’s an exponential decay. Also, the PACF graph shows that lag-1 is significant and, starting from lag-2, almost every lag is negative, with lag-3 having the highest significance in the negative domain. There are no seasonal features observed in these plots.
acf(decomp_7061886, na.action = na.pass)
pacf(decomp_7061886, na.action = na.pass)
According to the plots above, we should start with an ARIMA model of order c(1, 0, 0) and observe the outputs. After that, I’ve tried (2, 0, 0), (3, 0, 0), (4, 0, 0), and (5, 0, 0) to see the difference between the models. Beyond the 6th order, the model shows no significant improvement, thus I’ve selected (6, 0, 0) as my model.
model1 <- arima(decomp_7061886, order = c(1,0,0))
summary(model1)
##
## Call:
## arima(x = decomp_7061886, order = c(1, 0, 0))
##
## Coefficients:
## ar1 intercept
## 0.2049 0.0079
## s.e. 0.0493 1.2069
##
## sigma^2 estimated as 362.4: log likelihood = -1715.6, aic = 3437.2
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.00173562 19.03733 10.30929 451.9355 482.7022 0.8279748
## ACF1
## Training set 0.01321938
model2 <- arima(decomp_7061886, order = c(2,0,0))
summary(model2)
##
## Call:
## arima(x = decomp_7061886, order = c(2, 0, 0))
##
## Coefficients:
## ar1 ar2 intercept
## 0.2177 -0.0630 0.0092
## s.e. 0.0503 0.0502 1.1335
##
## sigma^2 estimated as 361: log likelihood = -1714.82, aic = 3437.63
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.001884731 18.99917 10.20014 478.1335 506.0391 0.8192085
## ACF1
## Training set -0.02144627
model3 <- arima(decomp_7061886, order = c(3,0,0))
summary(model3)
##
## Call:
## arima(x = decomp_7061886, order = c(3, 0, 0))
##
## Coefficients:
## ar1 ar2 ar3 intercept
## 0.1966 0.0125 -0.3464 0.0014
## s.e. 0.0472 0.0482 0.0473 0.7916
##
## sigma^2 estimated as 317.4: log likelihood = -1689.74, aic = 3389.47
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.02023501 17.81587 9.553373 822.4767 882.444 0.7672645
## ACF1
## Training set -0.07570482
model4 <- arima(decomp_7061886, order = c(4,0,0))
summary(model4)
##
## Call:
## arima(x = decomp_7061886, order = c(4, 0, 0))
##
## Coefficients:
## ar1 ar2 ar3 ar4 intercept
## 0.1209 0.0158 -0.3042 -0.2178 -0.0051
## s.e. 0.0491 0.0471 0.0471 0.0492 0.6349
##
## sigma^2 estimated as 302.2: log likelihood = -1680.19, aic = 3372.38
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.03250532 17.384 9.368597 909.3851 976.1841 0.7524245 -0.02059675
model5 <- arima(decomp_7061886, order = c(5,0,0))
summary(model5)
##
## Call:
## arima(x = decomp_7061886, order = c(5, 0, 0))
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5 intercept
## 0.1006 -0.0122 -0.3022 -0.2069 -0.0918 -0.0054
## s.e. 0.0502 0.0493 0.0469 0.0494 0.0502 0.5794
##
## sigma^2 estimated as 299.6: log likelihood = -1678.53, aic = 3371.05
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.03316681 17.30968 9.374549 823.7522 910.548 0.7529025
## ACF1
## Training set -0.01268297
model6 <- arima(decomp_7061886, order = c(6,0,0))
summary(model6)
##
## Call:
## arima(x = decomp_7061886, order = c(6, 0, 0))
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5 ar6 intercept
## 0.0883 -0.0397 -0.3421 -0.2077 -0.0789 -0.1317 -0.0047
## s.e. 0.0499 0.0500 0.0490 0.0489 0.0500 0.0499 0.5079
##
## sigma^2 estimated as 294.3: log likelihood = -1675.08, aic = 3366.16
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.03286147 17.15631 9.381341 839.3245 956.9745 0.7534479
## ACF1
## Training set -0.02036132
By incrementing the order of the ARIMA model by 1 each time, we can see the change in the AIC value, and beyond the (6, 0, 0) model it was not efficient to increase the model complexity. Therefore, I’d like to use (6, 0, 0) in the future tasks.
cor_plot(data_7061886, "Dik Süpürge")
In the project, most of the variables (basket_count, favored_count, category_sold) are not independent. Therefore, using multiple regressors in a model for this data set is not a suitable approach; it would give rise to a multicollinearity problem.
By looking the correlation plot, I’ve selected 3 distinct features to feed my ARIMA models and category_sold was the one that has given the smallest AIC value. Thus, I’ve selected my regressor as category_sold.
The block below tries visit_count as the regressor for the selected model in the previous task.
data_7061886_norm <- data_manip("7061886", normal = T)
model7 <- arima(decomp_7061886, order = c(6,0,0), xreg = cbind(data_7061886_norm$visit_count))
summary(model7)
##
## Call:
## arima(x = decomp_7061886, order = c(6, 0, 0), xreg = cbind(data_7061886_norm$visit_count))
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5 ar6 intercept
## 0.0479 -0.0406 -0.3322 -0.2234 -0.0725 -0.0899 -0.0471
## s.e. 0.0510 0.0518 0.0505 0.0495 0.0522 0.0548 0.4815
## cbind(data_7061886_norm$visit_count)
## 3.9560
## s.e. 0.6733
##
## sigma^2 estimated as 264.3: log likelihood = -1653.87, aic = 3325.73
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.03044494 16.25653 9.283638 461.0325 589.5924 0.7456011
## ACF1
## Training set -0.00817959
The block below tries basket_count as the regressor for the selected model in the previous task.
model8 <- arima(decomp_7061886, order = c(6,0,0), xreg = cbind(data_7061886_norm$basket_count))
summary(model8)
##
## Call:
## arima(x = decomp_7061886, order = c(6, 0, 0), xreg = cbind(data_7061886_norm$basket_count))
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5 ar6 intercept
## 0.0483 -0.0399 -0.3325 -0.2236 -0.0706 -0.0883 -0.0487
## s.e. 0.0511 0.0520 0.0505 0.0496 0.0524 0.0551 0.4819
## cbind(data_7061886_norm$basket_count)
## 4.0196
## s.e. 0.6798
##
## sigma^2 estimated as 263.6: log likelihood = -1653.33, aic = 3324.65
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.03044897 16.23425 9.244897 459.4313 581.9289 0.7424897
## ACF1
## Training set -0.007852957
The block below tries category_sold as the regressor for the selected model in the previous task.
model9 <- arima(decomp_7061886, order = c(6,0,0), xreg = cbind(data_7061886_norm$category_sold))
summary(model9)
##
## Call:
## arima(x = decomp_7061886, order = c(6, 0, 0), xreg = cbind(data_7061886_norm$category_sold))
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5 ar6 intercept
## 0.7002 0.2054 -0.2605 0.1736 0.0407 -0.0634 -0.1221
## s.e. 0.0675 0.0622 0.0627 0.0620 0.0645 0.0505 3.1123
## cbind(data_7061886_norm$category_sold)
## 26.0951
## s.e. 1.4255
##
## sigma^2 estimated as 161.7: log likelihood = -1557.57, aic = 3133.15
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.01630843 12.71697 8.860264 1493.742 1766.186 0.7115985
## ACF1
## Training set -0.0004039257
As we can see, category_sold has given the smallest AIC, thus my predictive model utilizes category_sold as its external regressor. The block below creates a model with the given specifications, constructs the predictions, and plots them to compare the results.
The plot shows that the predictions are well above the actual values for quite some time, but they get closer at the end of the prediction window. This may occur due to the lack of seasonality in our model: the data set has seasonal features, but we did not create a model with a seasonality option. Let’s check the WMAPE value to gain more insight.
comparison_7061886 <- forecast_func(data = data_7061886$sold_count, ar_order = c(6,0,0), reg_matrix = cbind(data_7061886_norm$category_sold))
comparison_7061886 <- data.table(cbind(data_7061886$event_date[(nrow(data_7061886)-6):nrow(data_7061886)], comparison_7061886))
colnames(comparison_7061886) <- c("event_date", "Actual", "Prediction")
plotting(comparison_7061886,product_name = "Dik Süpürge", col = "",num =2)
It is not a surprise for us to see such a high WMAPE score, since our predictions are far from the actual data. This also indicates that our model has functioned insufficiently and there’s much work to be done in the future to improve this model.
accu(comparison_7061886$Actual, comparison_7061886$Prediction)
## n mean sd CV FBias MAPE RMSE MAD MADP
## 1 7 8.142857 3.57904 0.4395312 -1.559327 2.536658 14.52767 12.69738 1.559327
## WMAPE
## 1 1.559327
Let’s examine the data of the “Bebek Islak Mendil”.
data_4066298 <- data_manip("4066298", normal = F)
head(data_4066298)
## event_date price sold_count visit_count basket_count favored_count
## 1: 2020-05-25 76.05423 142 2070 307 92
## 2: 2020-05-26 75.78349 169 2154 330 99
## 3: 2020-05-27 76.92846 201 2406 399 121
## 4: 2020-05-28 77.21761 176 2191 340 103
## 5: 2020-05-29 76.98344 154 2030 296 89
## 6: 2020-05-30 77.15858 162 2016 292 88
## category_sold category_visits category_basket category_favored
## 1: 413 1149 17326 2640
## 2: 486 1369 18807 3036
## 3: 562 1562 20289 3432
## 4: 550 1482 19597 3247
## 5: 473 1273 18126 2854
## 6: 494 1220 16783 2495
## category_brand_sold ty_visits discount
## 1: 2461 105438700 0.0000000
## 2: 2594 105956413 0.2707342
## 3: 2731 106495403 -1.1449666
## 4: 2710 106410299 -0.2891559
## 5: 2570 105864218 0.2341721
## 6: 2608 106013149 -0.1751387
As we can see below, data set has no significant level of increasing variance over time except for several outlying exceptions, thus decomposition should be done in an “additive” way.
After the decomposition, let’s check the KPSS test results for stationarity. The test statistic is well below the critical values, thus we fail to reject the null hypothesis, which is the stationarity of the data.
Also, if we try to decompose the data with a weekly and monthly perspective, we get an error since the dataset has no data that can support at least two periods. Therefore, we can only continue with the daily decomposition.
Furthermore, let’s analyze the ACF and PACF plots.
decomp_4066298 <- decomp(data_4066298, 7)
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 5 lags.
##
## Value of test-statistic is: 0.0066
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
As we can see, the ACF plot shows that there’s an exponential decay. Also, the PACF graph shows that lag-1 is significant and, starting from lag-2, almost every lag is negative, with lag-3 having the highest significance in the negative domain. There are no seasonal features observed in these plots.
acf(decomp_4066298, na.action = na.pass)
pacf(decomp_4066298, na.action = na.pass)
According to the plots above, we should start with an ARIMA model of order c(1, 0, 0) and observe the outputs. After that, I’ve tried (2, 0, 0), (3, 0, 0), and (4, 0, 0) to see the difference between the models. Beyond the 4th order, it was not efficient to increase the model complexity. Therefore, I’ve selected my model as (4, 0, 0).
model1 <- arima(decomp_4066298, order = c(1,0,0))
summary(model1)
##
## Call:
## arima(x = decomp_4066298, order = c(1, 0, 0))
##
## Coefficients:
## ar1 intercept
## 0.2243 -0.1281
## s.e. 0.0491 16.7156
##
## sigma^2 estimated as 66162: log likelihood = -2738.79, aic = 5483.58
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.0142147 257.2201 138.4546 25.66661 173.8286 0.8404375
## ACF1
## Training set 0.05905702
model2 <- arima(decomp_4066298, order = c(2,0,0))
summary(model2)
##
## Call:
## arima(x = decomp_4066298, order = c(2, 0, 0))
##
## Coefficients:
## ar1 ar2 intercept
## 0.2828 -0.2598 -0.036
## s.e. 0.0486 0.0486 12.828
##
## sigma^2 estimated as 61655: log likelihood = -2725, aic = 5458
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.03105885 248.3046 132.356 115.1177 171.4438 0.8034182 -0.0781715
model3 <- arima(decomp_4066298, order = c(3,0,0))
summary(model3)
##
## Call:
## arima(x = decomp_4066298, order = c(3, 0, 0))
##
## Coefficients:
## ar1 ar2 ar3 intercept
## 0.2032 -0.1743 -0.3034 0.2411
## s.e. 0.0480 0.0482 0.0480 9.3799
##
## sigma^2 estimated as 55931: log likelihood = -2706, aic = 5421.99
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.07773755 236.4973 128.9465 225.8142 331.8393 0.7827222
## ACF1
## Training set -0.02655702
model4 <- arima(decomp_4066298, order = c(4,0,0))
summary(model4)
##
## Call:
## arima(x = decomp_4066298, order = c(4, 0, 0))
##
## Coefficients:
## ar1 ar2 ar3 ar4 intercept
## 0.1759 -0.1900 -0.2858 -0.0890 0.3208
## s.e. 0.0502 0.0488 0.0488 0.0502 8.5779
##
## sigma^2 estimated as 55483: log likelihood = -2704.43, aic = 5420.87
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.09661878 235.5487 128.6532 109.5407 223.2888 0.780942
## ACF1
## Training set -0.01464148
By incrementing the order of the ARIMA model by 1 each time, we can see the change in the AIC value, and beyond the (4, 0, 0) model it was not efficient to increase the model complexity. Therefore, I’d like to use (4, 0, 0) in the future tasks.
cor_plot(data_4066298, "Bebek Islak Mendil")
In the project, most of the variables (basket_count, favored_count, category_sold) are not independent. Therefore, using multiple regressors in a model for this data set is not a suitable approach; it would give rise to a multicollinearity problem.
By looking the correlation plot, I’ve selected 3 distinct features to feed my ARIMA models and category_sold was the one that has given the smallest AIC value. Thus, I’ve selected my regressor as category_sold.
The block below tries visit_count as the regressor for the selected model in the previous task.
data_4066298_norm <- data_manip("4066298", normal = T)
model5 <- arima(decomp_4066298, order = c(4,0,0), xreg = cbind(data_4066298_norm$visit_count))
summary(model5)
##
## Call:
## arima(x = decomp_4066298, order = c(4, 0, 0), xreg = cbind(data_4066298_norm$visit_count))
##
## Coefficients:
## ar1 ar2 ar3 ar4 intercept
## 0.4527 0.0385 -0.0926 0.1770 -1.5154
## s.e. 0.0558 0.0578 0.0565 0.0506 22.5026
## cbind(data_4066298_norm$visit_count)
## 235.2609
## s.e. 17.8421
##
## sigma^2 estimated as 36265: log likelihood = -2620.81, aic = 5255.62
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.5359311 190.4334 116.0973 -24.01496 319.7511 0.7047259
## ACF1
## Training set -0.02183769
The block below tries basket_count as the regressor for the selected model in the previous task.
model6 <- arima(decomp_4066298, order = c(4,0,0), xreg = cbind(data_4066298_norm$basket_count))
summary(model6)
##
## Call:
## arima(x = decomp_4066298, order = c(4, 0, 0), xreg = cbind(data_4066298_norm$basket_count))
##
## Coefficients:
## ar1 ar2 ar3 ar4 intercept
## 0.4651 0.0650 -0.0753 0.1770 -1.4071
## s.e. 0.0540 0.0577 0.0562 0.0513 25.6317
## cbind(data_4066298_norm$basket_count)
## 248.3444
## s.e. 16.9054
##
## sigma^2 estimated as 35520: log likelihood = -2616.76, aic = 5247.52
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.6658279 188.467 113.7633 -57.15185 361.952 0.6905579
## ACF1
## Training set -0.02289932
The block below tries category_sold as the regressor for the selected model in the previous task.
model7 <- arima(decomp_4066298, order = c(4,0,0), xreg = cbind(data_4066298_norm$category_sold))
summary(model7)
##
## Call:
## arima(x = decomp_4066298, order = c(4, 0, 0), xreg = cbind(data_4066298_norm$category_sold))
##
## Coefficients:
## ar1 ar2 ar3 ar4 intercept
## 0.7826 -0.1097 -0.1751 0.2341 -1.7393
## s.e. 0.0800 0.0655 0.0622 0.0536 30.7476
## cbind(data_4066298_norm$category_sold)
## 314.3258
## s.e. 20.7510
##
## sigma^2 estimated as 27180: log likelihood = -2564.41, aic = 5142.81
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.7215695 164.8625 109.0541 -174.6048 444.4641 0.6619725
## ACF1
## Training set 0.01842316
As we can see, category_sold has given the smallest AIC, thus my predictive model utilizes category_sold as its external regressor. The block below creates a model with the given specifications, constructs the predictions, and plots them to compare the results.
The plot shows that predictions are way more than the actual values. This may occur due to the lack of seasonality in our data, since the data set has seasonal features, but we did not create a model with a seasonality option. Let’s check the WMAPE value for gaining more insight.
comparison_4066298 <- forecast_func(data = data_4066298$sold_count, ar_order = c(4,0,0), reg_matrix = cbind(data_4066298_norm$category_sold))
comparison_4066298 <- data.table(cbind(data_4066298$event_date[(nrow(data_4066298)-6):nrow(data_4066298)], comparison_4066298))
colnames(comparison_4066298) <- c("event_date", "Actual", "Prediction")
plotting(comparison_4066298,product_name = "Bebek Islak Mendil", col = "",num =2)
It is not a surprise for us to see such a high WMAPE score, since our predictions are far from the actual data. This also indicates that our model has functioned insufficiently and there’s much work to be done in the future to improve this model.
accu(comparison_4066298$Actual, comparison_4066298$Prediction)
## n mean sd CV FBias MAPE RMSE MAD MADP
## 1 7 468.1429 135.5242 0.2894933 -1.793841 1.920841 857.1184 839.7741 1.793841
## WMAPE
## 1 1.793841
Let’s examine the data of the “Şarj Edilebilir Diş Fırçası”.
data_32939029 <- data_manip("32939029", normal = F)
head(data_32939029)
## event_date price sold_count visit_count basket_count favored_count
## 1: 2020-05-25 112.9000 74 3113 323 369
## 2: 2020-05-26 115.8495 101 3704 411 451
## 3: 2020-05-27 114.1078 103 3617 398 439
## 4: 2020-05-28 115.1035 84 3428 370 413
## 5: 2020-05-29 126.1038 52 2669 257 309
## 6: 2020-05-30 127.5000 38 2179 184 241
## category_sold category_visits category_basket category_favored
## 1: 1193 1231 48380 4132
## 2: 1351 1419 52014 4647
## 3: 1071 1125 46574 3876
## 4: 927 978 42764 3336
## 5: 810 851 39533 2878
## 6: 842 890 41007 3087
## category_brand_sold ty_visits discount
## 1: 6411 119209111 0.0000000
## 2: 7222 122676139 -2.9495050
## 3: 5785 116532038 1.7417380
## 4: 5046 113372216 -0.9956854
## 5: 4445 110804859 -11.0003938
## 6: 4610 111507042 -1.3961538
As we can see below, data set has no significant level of increasing variance over time, thus decomposition should be done in an “additive” way.
After the decomposition, let’s check the KPSS test results for stationarity. The test statistic is well below the critical values, thus we fail to reject the null hypothesis, which is the stationarity of the data.
Also, if we try to decompose the data with a weekly and monthly perspective, we get an error since the dataset has no data that can support at least two periods. Therefore, we can only continue with the daily decomposition.
Furthermore, let’s analyze the ACF and PACF plots.
decomp_32939029 <- decomp(data_32939029, 7)
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 5 lags.
##
## Value of test-statistic is: 0.0071
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
As we can see, the ACF plot shows that there’s an exponential decay. Also, the PACF graph shows that lag-1 is significant and, starting from lag-2, almost every lag is negative, with lag-3 having the highest significance in the negative domain. There are no seasonal features observed in these plots.
acf(decomp_32939029, na.action = na.pass)
pacf(decomp_32939029, na.action = na.pass)
According to the plots above, we should start with an ARIMA model of order c(1, 0, 0) and observe the outputs. After that, I’ve tried (2, 0, 0), (3, 0, 0), and (4, 0, 0) to see the difference between the models. Beyond the 4th order, it was not efficient to make the model more complex, thus I’ve selected (4, 0, 0) as my baseline model.
# Baseline AR(1) model on the decomposed series.
model1 <- arima(decomp_32939029, order = c(1,0,0))
summary(model1)
##
## Call:
## arima(x = decomp_32939029, order = c(1, 0, 0))
##
## Coefficients:
## ar1 intercept
## 0.2424 -0.0382
## s.e. 0.0489 2.3229
##
## sigma^2 estimated as 1219: log likelihood = -1953.98, aic = 3913.96
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.0065658 34.91608 24.04219 145.0193 165.2094 0.8454933
## ACF1
## Training set 0.08650671
# AR(2): autoregressive order increased by one to compare AIC.
model2 <- arima(decomp_32939029, order = c(2,0,0))
summary(model2)
##
## Call:
## arima(x = decomp_32939029, order = c(2, 0, 0))
##
## Coefficients:
## ar1 ar2 intercept
## 0.3283 -0.3536 -0.0156
## s.e. 0.0471 0.0471 1.6074
##
## sigma^2 estimated as 1065: log likelihood = -1927.63, aic = 3863.27
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.004271236 32.64089 22.67419 86.33321 245.7932 0.7973849
## ACF1
## Training set -0.08688788
# AR(3): next candidate in the order search.
model3 <- arima(decomp_32939029, order = c(3,0,0))
summary(model3)
##
## Call:
## arima(x = decomp_32939029, order = c(3, 0, 0))
##
## Coefficients:
## ar1 ar2 ar3 intercept
## 0.2403 -0.2727 -0.2472 0.0123
## s.e. 0.0488 0.0483 0.0488 1.2490
##
## sigma^2 estimated as 999.7: log likelihood = -1915.21, aic = 3840.42
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.005329131 31.61751 21.55049 44.657 193.8696 0.7578676
## ACF1
## Training set -0.02782988
# AR(4): selected baseline — AIC gains flatten beyond this order.
model4 <- arima(decomp_32939029, order = c(4,0,0))
summary(model4)
##
## Call:
## arima(x = decomp_32939029, order = c(4, 0, 0))
##
## Coefficients:
## ar1 ar2 ar3 ar4 intercept
## 0.2119 -0.3040 -0.2195 -0.1144 0.0182
## s.e. 0.0501 0.0499 0.0500 0.0501 1.1140
##
## sigma^2 estimated as 986.4: log likelihood = -1912.62, aic = 3837.23
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.001371699 31.40738 21.35659 32.94868 193.1814 0.7510488
## ACF1
## Training set -0.0279598
By incrementing the order of the ARIMA model by 1 each time, we can see the change in the AIC value, and beyond the (4, 0, 0) model it was not efficient to increase the model complexity. Therefore, I’d like to use (4, 0, 0) in the future tasks.
# Correlation matrix plot of all features for product 32939029.
cor_plot(data_32939029, "Dik Süpürge")
In the project, most of the variables (basket_count, favored_count, category_sold) are not independent. Therefore, using multiple regressors on a model for this data set is not a suitable approach. It will arise multicollinearity problem.
By looking the correlation plot, I’ve selected 2 distinct features to feed my ARIMA models and category_sold was the one that has given the smallest AIC value. Thus, I’ve selected my regressor as category_sold.
The block below tries visit_count as the regressor for the selected model in the previous task.
# Normalized version of the product data so differently-scaled regressors
# are comparable (TRUE spelled out — T is a reassignable alias).
data_32939029_norm <- data_manip("32939029", normal = TRUE)
# ARIMA(4,0,0) with normalized visit_count as the external regressor.
model5 <- arima(decomp_32939029, order = c(4,0,0), xreg = cbind(data_32939029_norm$visit_count))
summary(model5)
##
## Call:
## arima(x = decomp_32939029, order = c(4, 0, 0), xreg = cbind(data_32939029_norm$visit_count))
##
## Coefficients:
## ar1 ar2 ar3 ar4 intercept
## 0.2194 -0.2881 -0.2107 -0.0784 -0.0316
## s.e. 0.0525 0.0515 0.0512 0.0546 1.1309
## cbind(data_32939029_norm$visit_count)
## 6.3921
## s.e. 1.3942
##
## sigma^2 estimated as 922.2: log likelihood = -1899.34, aic = 3812.68
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.007547913 30.36736 20.86472 86.85503 194.9023 0.7337513
## ACF1
## Training set -0.01504189
The block below tries basket_count as the regressor for the selected model in the previous task.
# Same ARIMA(4,0,0) but with normalized basket_count as the regressor.
model6 <- arima(decomp_32939029, order = c(4,0,0), xreg = cbind(data_32939029_norm$basket_count))
summary(model6)
##
## Call:
## arima(x = decomp_32939029, order = c(4, 0, 0), xreg = cbind(data_32939029_norm$basket_count))
##
## Coefficients:
## ar1 ar2 ar3 ar4 intercept
## 0.2280 -0.2830 -0.2069 -0.0657 -0.0329
## s.e. 0.0541 0.0522 0.0518 0.0568 1.1481
## cbind(data_32939029_norm$basket_count)
## 7.2284
## s.e. 1.5119
##
## sigma^2 estimated as 908.8: log likelihood = -1896.45, aic = 3806.9
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.007727977 30.1456 20.77369 93.52427 205.3973 0.7305498
## ACF1
## Training set -0.01144861
As we can see, basket_count has given the smallest AIC, thus my predictive model is utilizing basket_count as its external regressor. And the block below creates a model with the given specifications, constructs the predictions and plot them to compare the results.
The plot shows that predictions are much higher than the actual values for quite a while, but they get closer at the end of the prediction window. This may occur because the data set has seasonal features, yet we did not create a model with a seasonality option. Let’s check the WMAPE value for gaining more insight.
# Last-week forecast with the chosen ARIMA(4,0,0) + basket_count model.
comparison_32939029 <- forecast_func(data = data_32939029$sold_count, ar_order = c(4,0,0), reg_matrix = cbind(data_32939029_norm$basket_count))
# Attach the dates of the final 7 observations and label the columns.
comparison_32939029 <- data.table(cbind(data_32939029$event_date[(nrow(data_32939029)-6):nrow(data_32939029)], comparison_32939029))
colnames(comparison_32939029) <- c("event_date", "Actual", "Prediction")
# Fixed product label: this section analyzes product 32939029
# ("Dik Süpürge", see the cor_plot call above), not "Şarjlı Diş Fırçası".
plotting(comparison_32939029,product_name = "Dik Süpürge", col = "",num =2)
It is not a surprise for us to see a WMAPE score that high, since our predictions are far off from the actual data. This also indicates that our model has performed insufficiently and there’s much work to be done in the future to improve this model.
# Accuracy measures (including WMAPE, shown below) of predictions vs. actuals.
accu(comparison_32939029$Actual, comparison_32939029$Prediction)
## n mean sd CV FBias MAPE RMSE MAD MADP
## 1 7 60.14286 31.16851 0.5182413 -2.112039 2.316722 133.719 127.0241 2.112039
## WMAPE
## 1 2.112039
The data is imported and necessary manipulations are made.
Let us have a look at the first 6 rows of each column.
# Import product 48740784 without normalization (FALSE spelled out —
# F is a reassignable alias).
data_48740784 <- data_manip("48740784", normal = FALSE)
# Inspect the first six rows of every column.
head(data_48740784)
## event_date price sold_count visit_count basket_count favored_count
## 1: 2020-05-25 833.32 0 18 0 1
## 2: 2020-05-26 833.32 0 18 0 1
## 3: 2020-05-27 833.32 0 18 0 1
## 4: 2020-05-28 833.32 0 18 0 1
## 5: 2020-05-29 833.32 0 18 0 1
## 6: 2020-05-30 833.32 0 18 0 1
## category_sold category_visits category_basket category_favored
## 1: 15 1002 248050 6866
## 2: 14 1091 257980 7201
## 3: 16 1036 232193 6331
## 4: 17 846 213727 5708
## 5: 21 932 207028 5482
## 6: 15 778 200893 5275
## category_brand_sold ty_visits discount
## 1: 33713 116610408 0
## 2: 32590 116586887 0
## 3: 34837 116633930 0
## 4: 35960 116657451 0
## 5: 40453 116751537 0
## 6: 33713 116610408 0
The first step will be to apply daily decomposition. From the project report, it is obvious that the sales variance does not rise with time. As a result, the decomposition method should be additive.
# Additive daily decomposition (frequency 7); KPSS test output below.
decomp_48740784 <- decomp(data_48740784, 7)
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 5 lags.
##
## Value of test-statistic is: 0.0085
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
The unit root test results belong to the random component of the decomposition. Since the test statistic is lower than every critical value, we fail to reject the null hypothesis. The null hypothesis is that the data is stationary. So, the data is stationary.
As you can observe, for some seasons such as Summer of 2020, January of 2021 or Spring of 2021 there are not any sales at all, which leads us there may be a kind of a seasonality or there may be a different effects such as price. To understand what affects our daily sales, it is necessary that to control over correlations between different properties.
Because the data does not cover sales from the previous two years, weekly and monthly decompositions are not possible. There are not enough time periods for different kind of composition. As a result, an error message is generated by this type of decomposition.
Now, the autocorrelation and partial autocorrelation graphs will be analyzed.
# ACF and PACF of the decomposed series; na.pass tolerates missing values.
acf(decomp_48740784, na.action = na.pass)
pacf(decomp_48740784, na.action = na.pass)
In this graph, you can observe that lag-1 has a substantial autocorrelation and exponential decline. After lag-1, the partial autocorrelation graph shows a sharp reduction, while the following lags exhibit generally negative autocorrelation. In addition, the ACF plot resembles a sinus function. These graphics do not show seasonal autocorrelation.
According to the plots above, ARIMA (1,0,0) can be applied as a baseline model.
# Baseline AR(1) model on the decomposed series.
model1 <- arima(decomp_48740784, order = c(1,0,0))
summary(model1)
##
## Call:
## arima(x = decomp_48740784, order = c(1, 0, 0))
##
## Coefficients:
## ar1 intercept
## 0.0431 0.0013
## s.e. 0.0503 0.1404
##
## sigma^2 estimated as 7.097: log likelihood = -942.72, aic = 1891.43
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 3.129096e-05 2.663994 0.8908194 6.388256 182.2824 0.7377994
## ACF1
## Training set 0.01344526
Since the lag-4 variable has a significant partial autocorrelation after the biggest one at lag-3, ARIMA (0,0,4) can also be tried.
# Pure MA(4) alternative.
model2 <- arima(decomp_48740784, order = c(0,0,4))
summary(model2)
##
## Call:
## arima(x = decomp_48740784, order = c(0, 0, 4))
##
## Coefficients:
## ma1 ma2 ma3 ma4 intercept
## -0.6873 -0.6180 -0.1070 0.4123 0e+00
## s.e. 0.0624 0.0857 0.0896 0.0642 6e-04
##
## sigma^2 estimated as 4.267: log likelihood = -846.62, aic = 1705.24
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.009097396 2.065695 0.8381743 810.9812 1023.193 0.6941974
## ACF1
## Training set 0.152897
Lastly, combining these two approaches may lead us to the better one. The ARIMA model (1,0,4) is below.
# Combined ARMA(1,4) model — merges the AR(1) and MA(4) candidates.
model3 <- arima(decomp_48740784, order = c(1,0,4))
summary(model3)
##
## Call:
## arima(x = decomp_48740784, order = c(1, 0, 4))
##
## Coefficients:
## ar1 ma1 ma2 ma3 ma4 intercept
## 0.5166 -1.0891 -0.3018 0.0487 0.3422 0e+00
## s.e. 0.0679 0.0639 0.0900 0.1063 0.0580 4e-04
##
## sigma^2 estimated as 3.832: log likelihood = -826.04, aic = 1666.08
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.01174441 1.957431 0.7599667 643.0343 731.5331 0.6294238
## ACF1
## Training set 0.03920278
The lowest AIC value belongs to ARIMA (1,0,4) model. Therefore, this model can be chosen as the last model.
Most variables in the project are not independent (basket count, favored count, category sold). As a result, utilizing numerous regressors in the model isn’t recommended. There will be a multicollinearity issue.
Now, the correlation matrix will be observed.
cor_plot(data_48740784, "Mont")
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
The number of sales have high correlation with basket_count, and visit_count. Since the features are scaled differently, they should be normalized.
# NOTE(review): despite the "decomp_" prefix this holds the normalized raw
# data, not a decomposition — the name is kept so later references still work.
decomp_48740784_norm <- data_manip("48740784", normal = TRUE)
# ARIMA(1,0,4) with normalized basket_count as the external regressor.
model4 <- arima(decomp_48740784, order = c(1,0,4), xreg = cbind(decomp_48740784_norm$basket_count))
summary(model4)
##
## Call:
## arima(x = decomp_48740784, order = c(1, 0, 4), xreg = cbind(decomp_48740784_norm$basket_count))
##
## Coefficients:
## ar1 ma1 ma2 ma3 ma4 intercept
## 0.9712 -0.5373 -0.0453 -0.1663 0.0132 0.0407
## s.e. 0.0147 0.0557 0.0632 0.0640 0.0594 0.7010
## cbind(decomp_48740784_norm$basket_count)
## 3.0028
## s.e. 0.1333
##
## sigma^2 estimated as 2.641: log likelihood = -749.01, aic = 1514.02
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.01247259 1.625112 0.881014 -985.4826 1492.602 0.7296783
## ACF1
## Training set 0.001403451
It can be said that AIC value decreased, which is preferred.
Now, visit_count will be added, instead of basket_count
# Same ARIMA(1,0,4) but with normalized visit_count as the regressor.
model5 <- arima(decomp_48740784, order = c(1,0,4), xreg = cbind(decomp_48740784_norm$visit_count))
summary(model5)
##
## Call:
## arima(x = decomp_48740784, order = c(1, 0, 4), xreg = cbind(decomp_48740784_norm$visit_count))
##
## Coefficients:
## ar1 ma1 ma2 ma3 ma4 intercept
## 0.9701 -0.5179 -0.0541 -0.1672 0.0134 0.0256
## s.e. 0.0153 0.0557 0.0619 0.0628 0.0593 0.7031
## cbind(decomp_48740784_norm$visit_count)
## 3.0254
## s.e. 0.1323
##
## sigma^2 estimated as 2.652: log likelihood = -749.86, aic = 1515.72
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.01222348 1.628625 0.8999529 -1011.278 1529.505 0.745364
## ACF1
## Training set 0.001303653
The model with basket_count has smaller AIC value, compared to the model with visit_count. Therefore, it will be chosen as final model.
Now, forecasts for the last week will be made and plotted to better analysis.
# The final model chosen above uses basket_count (lower AIC than visit_count),
# so forecast with the normalized basket_count regressor — the original call
# mistakenly fed raw visit_count.
comparison_48740784 <- forecast_func(data = data_48740784$sold_count, ar_order = c(1,0,4), reg_matrix = cbind(decomp_48740784_norm$basket_count))
## Warning in cbind(intercept = rep(1, n.ahead), newxreg): number of rows of result
## is not a multiple of vector length (arg 1)
## Warning in cbind(intercept = rep(1, n.ahead), newxreg): number of rows of result
## is not a multiple of vector length (arg 1)
## Warning in cbind(intercept = rep(1, n.ahead), newxreg): number of rows of result
## is not a multiple of vector length (arg 1)
## Warning in cbind(intercept = rep(1, n.ahead), newxreg): number of rows of result
## is not a multiple of vector length (arg 1)
## Warning in cbind(intercept = rep(1, n.ahead), newxreg): number of rows of result
## is not a multiple of vector length (arg 1)
## Warning in cbind(intercept = rep(1, n.ahead), newxreg): number of rows of result
## is not a multiple of vector length (arg 1)
## Warning in cbind(intercept = rep(1, n.ahead), newxreg): number of rows of result
## is not a multiple of vector length (arg 1)
# Attach the dates of the final 7 observations and label the columns.
comparison_48740784 <- data.table(cbind(data_48740784$event_date[(nrow(data_48740784)-6):nrow(data_48740784)], comparison_48740784))
colnames(comparison_48740784) <- c("event_date", "Actual", "Prediction")
# Plot actual vs. predicted sales for the last week.
plotting(comparison_48740784,product_name = "Mont", col = "",num =2)
For a prediction model that predicts different numbers between 0 and 6, accuracy of our model seems fair enough, which leads us to calculations of the other products.
The data is imported and the appropriate adjustments are done.
Let’s examine the top six rows of each column.
# Import product 73318567 without normalization.
data_73318567 <- data_manip("73318567", normal = FALSE)
# Fixed copy-paste bug: preview the newly imported product,
# not data_48740784 from the previous section.
head(data_73318567)
## event_date price sold_count visit_count basket_count favored_count
## 1: 2020-05-25 833.32 0 18 0 1
## 2: 2020-05-26 833.32 0 18 0 1
## 3: 2020-05-27 833.32 0 18 0 1
## 4: 2020-05-28 833.32 0 18 0 1
## 5: 2020-05-29 833.32 0 18 0 1
## 6: 2020-05-30 833.32 0 18 0 1
## category_sold category_visits category_basket category_favored
## 1: 15 1002 248050 6866
## 2: 14 1091 257980 7201
## 3: 16 1036 232193 6331
## 4: 17 846 213727 5708
## 5: 21 932 207028 5482
## 6: 15 778 200893 5275
## category_brand_sold ty_visits discount
## 1: 33713 116610408 0
## 2: 32590 116586887 0
## 3: 34837 116633930 0
## 4: 35960 116657451 0
## 5: 40453 116751537 0
## 6: 33713 116610408 0
The first stage will be to decompose on a daily basis. It is clear from the project report that the sales variation does not increase over time. As a result, the method of decomposition must be additive.
# Additive daily decomposition (frequency 7); KPSS test output below.
decomp_73318567 <- decomp(data_73318567, 7)
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 5 lags.
##
## Value of test-statistic is: 0.0104
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
The unit root test results belong to the random component of the decomposition. Since the test statistic is lower than every critical value, the null hypothesis is not rejected. The null hypothesis is that the data is stationary, so the data is stationary.
As you can observe from the plot, the sales over time are very different from each other. There are hardly any sales until February 2021; the reason could be that the bikini top was not for sale in that time frame. After February 2021 the sales increased and then dropped sharply, and again there were no sales — possibly for the same reason. Later, when summer hits, the sales increased very quickly, which may have occurred because of a change in price or increased stock of our product. Weekly and monthly decomposition cannot be applied because the data does not include two full years of sales; there are fewer than two periods, so that type of decomposition raises an error message.
The graphs of autocorrelation and partial autocorrelation will now be examined.
# ACF and PACF of the decomposed series; na.pass tolerates missing values.
acf(decomp_73318567, na.action = na.pass)
pacf(decomp_73318567, na.action = na.pass)
In this graph, you can observe that lag-1 has a substantial autocorrelation and exponential decline. After lag-1, the partial autocorrelation graph shows a sharp decline, while the following lags have negative autocorrelation. These graphics do not show seasonal autocorrelation.
According to the plots above, ARIMA (1,0,0) can be applied as a baseline model.
# Baseline AR(1) model on the decomposed series.
model1 <- arima(decomp_73318567, order = c(1,0,0))
summary(model1)
##
## Call:
## arima(x = decomp_73318567, order = c(1, 0, 0))
##
## Coefficients:
## ar1 intercept
## 0.1178 0.0388
## s.e. 0.0500 0.5990
##
## sigma^2 estimated as 109.8: log likelihood = -1480.96, aic = 2967.92
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.0009533427 10.4792 4.583608 136.85 138.4337 0.7408935 0.023321
Since lag-5 variable has a significant autocorrelation, ARIMA (5,0,0) can also be tried.
# AR(5) candidate, motivated by the significant lag-5 in the plots.
model2 <- arima(decomp_73318567, order = c(5,0,0))
summary(model2)
##
## Call:
## arima(x = decomp_73318567, order = c(5, 0, 0))
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5 intercept
## 0.0389 -0.2421 -0.1975 -0.1325 -0.2993 0.0185
## s.e. 0.0481 0.0476 0.0482 0.0478 0.0483 0.2644
##
## sigma^2 estimated as 91.43: log likelihood = -1445.33, aic = 2904.65
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.001477023 9.562058 3.965705 166.4667 185.1378 0.6410158
## ACF1
## Training set -0.03199084
Lastly, lag-5 variable has also significant autocorrelation, and it is the last variable being significant. The model is below.
# Pure MA(5) alternative.
model3 <- arima(decomp_73318567, order = c(0,0,5))
summary(model3)
##
## Call:
## arima(x = decomp_73318567, order = c(0, 0, 5))
##
## Coefficients:
## ma1 ma2 ma3 ma4 ma5 intercept
## -0.0802 -0.3475 -0.2691 -0.1259 -0.1773 0.0017
## s.e. 0.0493 0.0548 0.0581 0.0481 0.0619 0.0120
##
## sigma^2 estimated as 84.12: log likelihood = -1430.96, aic = 2875.92
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.05333884 9.171719 3.814444 122.369 148.3585 0.6165659
## ACF1
## Training set -0.004057043
Lastly, combining these two approaches may lead us to the better one. The ARIMA model (5,0,5) is below.
# Combined ARMA(5,5) model — merges the AR(5) and MA(5) candidates.
model4 <- arima(decomp_73318567, order = c(5,0,5))
summary(model4)
##
## Call:
## arima(x = decomp_73318567, order = c(5, 0, 5))
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5 ma1 ma2 ma3
## 0.2012 -0.3961 -0.4063 0.5749 -0.3816 -0.2994 0.0958 0.1195
## s.e. 0.1305 0.0628 0.0731 0.0739 0.0900 0.1421 0.0709 0.0661
## ma4 ma5 intercept
## -0.9243 0.0083 0.0017
## s.e. 0.0648 0.1369 0.0094
##
## sigma^2 estimated as 78.1: log likelihood = -1419.5, aic = 2862.99
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.06261232 8.837703 3.952581 178.4168 246.9922 0.6388944
## ACF1
## Training set -0.0002326938
The lowest AIC value belongs to ARIMA (5,0,5) model. Therefore, this model can be chosen as the last model.
Most variables in the project are not independent (basket count, favored count, category sold). As a result, utilizing numerous regressors in the model isn’t recommended because will be a multicollinearity issue.
Now, the correlation matrix will be observed.
# Correlation matrix plot of all features for product 73318567.
cor_plot(data_73318567, "Bikini Üstü")
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
The number of sales have high correlation with basket_count, and visit_count. Since the features are scaled differently, they should be normalized.
# Fixed copy-paste bug: normalize product 73318567 itself — the original
# call normalized "48740784", feeding the wrong product's basket_count into
# the regressor models below (a likely cause of its insignificant coefficient).
# NOTE(review): the "decomp_" prefix is misleading; this is normalized raw data.
decomp_73318567_norm <- data_manip("73318567", normal = TRUE)
# ARIMA(5,0,5) with normalized basket_count as the external regressor.
model4 <- arima(decomp_73318567, order = c(5,0,5), xreg = cbind(decomp_73318567_norm$basket_count))
summary(model4)
##
## Call:
## arima(x = decomp_73318567, order = c(5, 0, 5), xreg = cbind(decomp_73318567_norm$basket_count))
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5 ma1 ma2 ma3
## 0.1986 -0.3966 -0.4066 0.5734 -0.3798 -0.2967 0.0956 0.1192
## s.e. 0.1319 0.0647 0.0755 0.0747 0.0903 0.1434 0.0733 0.0686
## ma4 ma5 intercept cbind(decomp_73318567_norm$basket_count)
## -0.9232 0.0052 0.0011 0.0097
## s.e. 0.0664 0.1372 0.0100 0.0568
##
## sigma^2 estimated as 78.1: log likelihood = -1419.48, aic = 2864.96
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.06162157 8.837562 3.957561 180.0684 256.0564 0.6396993
## ACF1
## Training set -0.0003103404
It can be said that the AIC value is approximately the same.
Now, visit_count will be added, instead of basket_count
# Same ARIMA(5,0,5) but with normalized visit_count as the regressor.
model5 <- arima(decomp_73318567, order = c(5,0,5), xreg = cbind(decomp_73318567_norm$visit_count))
summary(model5)
##
## Call:
## arima(x = decomp_73318567, order = c(5, 0, 5), xreg = cbind(decomp_73318567_norm$visit_count))
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5 ma1 ma2 ma3
## 0.2012 -0.3959 -0.4065 0.5751 -0.3817 -0.2994 0.0957 0.1197
## s.e. 0.1304 0.0625 0.0727 0.0738 0.0900 0.1420 0.0705 0.0657
## ma4 ma5 intercept cbind(decomp_73318567_norm$visit_count)
## -0.9244 0.0085 0.0013 0.0054
## s.e. 0.0645 0.1369 0.0101 0.0561
##
## sigma^2 estimated as 78.1: log likelihood = -1419.49, aic = 2864.98
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.06278932 8.837569 3.95562 179.6783 251.8685 0.6393856
## ACF1
## Training set -0.0002240329
It has the same AIC value as the model with basket_count. Therefore, selecting either one of them is enough.
Now, lastly, category_visits will be used as a regressor.
# Same ARIMA(5,0,5) but with normalized category_visits as the regressor.
model6 <- arima(decomp_73318567, order = c(5,0,5), xreg = cbind(decomp_73318567_norm$category_visits))
summary(model6)
##
## Call:
## arima(x = decomp_73318567, order = c(5, 0, 5), xreg = cbind(decomp_73318567_norm$category_visits))
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5 ma1 ma2 ma3
## 0.2040 -0.3969 -0.4069 0.5751 -0.3849 -0.3037 0.0967 0.1196
## s.e. 0.1302 0.0625 0.0728 0.0738 0.0896 0.1421 0.0706 0.0658
## ma4 ma5 intercept cbind(decomp_73318567_norm$category_visits)
## -0.9253 0.0127 0.0087 -0.0305
## s.e. 0.0646 0.1370 0.0132 0.0401
##
## sigma^2 estimated as 77.99: log likelihood = -1419.21, aic = 2864.42
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.1511481 8.831039 3.997122 199.7451 274.4271 0.6460939
## ACF1
## Training set -0.0004398796
This model has a lower AIC value than the other regressor-based models; note, however, that the regressor-free ARIMA (5,0,5) model still has the lowest AIC overall. Among the regressor-based candidates, the model with category_visits will be used as the final model.
Now, forecasts for the last week will be made.
# Last-week forecast with ARIMA(5,0,5) + category_visits.
# NOTE(review): the fitted models above used the normalized regressor
# (decomp_73318567_norm$category_visits) while this call feeds the raw
# column — confirm forecast_func expects raw values.
comparison_73318567 <- forecast_func(data = data_73318567$sold_count, ar_order = c(5,0,5), reg_matrix = cbind(data_73318567$category_visits))
## Warning in cbind(intercept = rep(1, n.ahead), newxreg): number of rows of result
## is not a multiple of vector length (arg 1)
## Warning in cbind(intercept = rep(1, n.ahead), newxreg): number of rows of result
## is not a multiple of vector length (arg 1)
## Warning in cbind(intercept = rep(1, n.ahead), newxreg): number of rows of result
## is not a multiple of vector length (arg 1)
## Warning in cbind(intercept = rep(1, n.ahead), newxreg): number of rows of result
## is not a multiple of vector length (arg 1)
## Warning in cbind(intercept = rep(1, n.ahead), newxreg): number of rows of result
## is not a multiple of vector length (arg 1)
## Warning in cbind(intercept = rep(1, n.ahead), newxreg): number of rows of result
## is not a multiple of vector length (arg 1)
## Warning in cbind(intercept = rep(1, n.ahead), newxreg): number of rows of result
## is not a multiple of vector length (arg 1)
# Attach the dates of the final 7 observations and label the columns.
comparison_73318567 <- data.table(cbind(data_73318567$event_date[(nrow(data_73318567)-6):nrow(data_73318567)], comparison_73318567))
colnames(comparison_73318567) <- c("event_date", "Actual", "Prediction")
# Plot actual vs. predicted sales for the last week.
plotting(comparison_73318567,product_name = "Bikini Üstü(1)", col = "",num =2)
The forecasts seems not good enough but it is valid for a prediction model that uses only AR and MA approaches. To provide more improvement for the model, it would be wise that using seasonalities because it is a seasonal product or using SARIMA or GAM models.
The data is imported and necessary manipulations are made.
Let us have a look at the first 6 rows of each column.
# Import product 31515569 without normalization.
data_31515569 <- data_manip("31515569", normal = FALSE)
# Fixed copy-paste bug: preview the newly imported product,
# not data_48740784 from an earlier section.
head(data_31515569)
## event_date price sold_count visit_count basket_count favored_count
## 1: 2020-05-25 833.32 0 18 0 1
## 2: 2020-05-26 833.32 0 18 0 1
## 3: 2020-05-27 833.32 0 18 0 1
## 4: 2020-05-28 833.32 0 18 0 1
## 5: 2020-05-29 833.32 0 18 0 1
## 6: 2020-05-30 833.32 0 18 0 1
## category_sold category_visits category_basket category_favored
## 1: 15 1002 248050 6866
## 2: 14 1091 257980 7201
## 3: 16 1036 232193 6331
## 4: 17 846 213727 5708
## 5: 21 932 207028 5482
## 6: 15 778 200893 5275
## category_brand_sold ty_visits discount
## 1: 33713 116610408 0
## 2: 32590 116586887 0
## 3: 34837 116633930 0
## 4: 35960 116657451 0
## 5: 40453 116751537 0
## 6: 33713 116610408 0
The first step will be a daily decomposition. The project report clearly shows that the sales variance does not rise over time. As a result, the decomposition process must be additive.
# Additive daily decomposition (frequency 7); KPSS test output below.
decomp_31515569 <- decomp(data_31515569, 7)
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 5 lags.
##
## Value of test-statistic is: 0.0074
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
The random component of the decomposition includes the results of the unit root test. The null hypothesis is not rejected since the test statistic is less than every critical value. The null hypothesis states that the data is stationary, so we conclude the data is stationary.
From the plot it can be observed that there is much variation in the series. The sales follow a general trend over time. However, there are various sudden spikes in sales, like the one on “Black Friday”. The reason behind that could be a decrease in price. The following calculations will help us determine whether this assumption is correct. In order to become a good prediction model, the model needs some regressors to predict the next sales.
Weekly and monthly decompositions are not possible because the data does not include sales from the prior two years. There are insufficient time periods for various types of composing. As a result, this type of decomposition generates an error message.
Now, the autocorrelation and partial autocorrelation graphs will be analyzed.
# ACF and PACF of the decomposed series; na.pass tolerates missing values.
acf(decomp_31515569, na.action = na.pass)
pacf(decomp_31515569, na.action = na.pass)
You can see that lag-1 has a lot of autocorrelation and an exponential drop in this graph. The partial autocorrelation graph reveals a rapid decline after lag-1, with generally negative autocorrelation for the succeeding lags. Furthermore, the ACF plot looks like a sinus function. Seasonal autocorrelation is not seen in these graphs.
According to the plots above, ARIMA (1,0,0) can be applied as a baseline model.
# Baseline AR(1) model on the decomposed series.
model1 <- arima(decomp_31515569, order = c(1,0,0))
summary(model1)
##
## Call:
## arima(x = decomp_31515569, order = c(1, 0, 0))
##
## Coefficients:
## ar1 intercept
## 0.3314 -0.6102
## s.e. 0.0476 40.3561
##
## sigma^2 estimated as 286857: log likelihood = -3027.07, aic = 6060.13
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.4661154 535.5908 290.1284 129.6814 163.4625 0.9149846 0.06772813
Since lag-4 variable has a significant autocorrelation, ARIMA (4,0,0) can also be tried.
# AR(4) candidate, motivated by the significant lag-4 in the plots.
model2 <- arima(decomp_31515569, order = c(4,0,0))
summary(model2)
##
## Call:
## arima(x = decomp_31515569, order = c(4, 0, 0))
##
## Coefficients:
## ar1 ar2 ar3 ar4 intercept
## 0.2672 -0.1142 -0.1971 -0.2600 -0.2653
## s.e. 0.0487 0.0496 0.0496 0.0487 18.8041
##
## sigma^2 estimated as 234876: log likelihood = -2988.09, aic = 5988.18
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.8611098 484.64 260.1447 77.37513 146.6412 0.8204243 -0.03183378
Lastly, lag-4 variable has also significant partial autocorrelation. The model is below.
# Pure MA(4) alternative.
model3 <- arima(decomp_31515569, order = c(0,0,4))
summary(model3)
##
## Call:
## arima(x = decomp_31515569, order = c(0, 0, 4))
##
## Coefficients:
## ma1 ma2 ma3 ma4 intercept
## 0.0452 -0.3004 -0.4003 -0.3444 0.0502
## s.e. 0.0464 0.0459 0.0433 0.0514 0.6222
##
## sigma^2 estimated as 204227: log likelihood = -2962.74, aic = 5937.48
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 1.676329 451.9153 237.7718 64.68362 99.17051 0.7498663 0.02782415
Lastly, combining these two approaches may lead us to the better one. The ARIMA model (4,0,4) is below.
# Fixed copy-paste bug: this section models product 31515569, but the
# original call fitted ARIMA(4,0,4) on decomp_73318567 (the previous
# product). The echoed output below (sigma^2 ~ 82, aic ~ 2875 versus
# ~5900 for models 1-3) confirms the wrong series was used, which made
# the AIC comparison invalid; the output must be regenerated.
model4 <- arima(decomp_31515569, order = c(4,0,4))
summary(model4)
##
## Call:
## arima(x = decomp_73318567, order = c(4, 0, 4))
##
## Coefficients:
## ar1 ar2 ar3 ar4 ma1 ma2 ma3 ma4
## -0.0891 -0.0194 0.5088 -0.4297 0.0038 -0.3637 -0.9041 0.2639
## s.e. 0.1543 0.0903 0.0502 0.0889 0.1650 0.0532 0.0563 0.1508
## intercept
## 0.0013
## s.e. 0.0093
##
## sigma^2 estimated as 82.38: log likelihood = -1427.6, aic = 2875.19
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.05929689 9.076216 3.867446 147.8099 208.1097 0.6251332
## ACF1
## Training set -0.0008030574
As you can observe, the AIC appears to drop dramatically (to about 2875 from about 6000) when these two approaches are combined; note, however, that AIC values are only comparable between models fitted on the same series, so model4 must be fitted on decomp_31515569 for this comparison to be meaningful.
Hence, the last model (4,0,4) is chosen to be improved with further analysis such as adding new regressors.
The majority of the project’s variables are not independent (basket count, favored count, category sold). As a result, including several regressors in the model is not suggested due to the risk of multicollinearity.
Now, the correlation matrix will be observed.
# Correlation matrix plot of all features for product 31515569.
cor_plot(data_31515569, "Tayt")
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
The number of sales have high correlation with category_sold, basket_count, and category_visit. Since the features are scaled differently, they should be normalized.
# Fixed copy-paste bug: normalize product 31515569 itself — the original
# call normalized "48740784", feeding the wrong product's columns into the
# regressor models below.
# NOTE(review): the "decomp_" prefix is misleading; this is normalized raw data.
decomp_31515569_norm <- data_manip("31515569", normal = TRUE)
# ARIMA(4,0,4) with normalized category_sold as the external regressor.
model4 <- arima(decomp_31515569, order = c(4,0,4), xreg = cbind(decomp_31515569_norm$category_sold))
summary(model4)
##
## Call:
## arima(x = decomp_31515569, order = c(4, 0, 4), xreg = cbind(decomp_31515569_norm$category_sold))
##
## Coefficients:
## ar1 ar2 ar3 ar4 ma1 ma2 ma3 ma4
## -0.1003 0.3711 0.2297 -0.7202 0.0846 -0.7666 -0.7404 0.4227
## s.e. 0.0658 0.0673 0.0385 0.0472 0.1016 0.0619 0.0590 0.1023
## intercept cbind(decomp_31515569_norm$category_sold)
## -0.2676 2.2201
## s.e. 0.4016 1.7600
##
## sigma^2 estimated as 182323: log likelihood = -2941.43, aic = 5904.85
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 3.328515 426.9928 241.4545 89.15942 148.6931 0.7614808 0.01683631
It can be said that AIC value increases, which is not preferred.
Now, basket_count will be added, instead of category_sold.
# Same ARIMA(4,0,4) but with normalized basket_count as the regressor.
model5 <- arima(decomp_31515569, order = c(4,0,4), xreg = cbind(decomp_31515569_norm$basket_count))
summary(model5)
##
## Call:
## arima(x = decomp_31515569, order = c(4, 0, 4), xreg = cbind(decomp_31515569_norm$basket_count))
##
## Coefficients:
## ar1 ar2 ar3 ar4 ma1 ma2 ma3 ma4
## 0.0675 0.2203 0.1198 -0.5543 -0.0758 -0.5874 -0.5742 0.2375
## s.e. NaN 0.1965 NaN NaN NaN 0.2170 NaN 0.0599
## intercept cbind(decomp_31515569_norm$basket_count)
## -0.0345 1.2938
## s.e. 0.3599 2.0245
##
## sigma^2 estimated as 184615: log likelihood = -2943.6, aic = 5909.2
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.3102511 429.668 237.2994 76.64672 136.5645 0.7483766 0.00654275
It has higher AIC value, compared to the model with category_sold. Therefore, it will not be chosen as final model.
Now, lastly, category_visits will be used as a regressor.
# Third regressor candidate: normalized category_visits with the same
# ARIMA(4,0,4) specification, again compared on AIC.
model6 <- arima(decomp_31515569, order = c(4,0,4), xreg = cbind(decomp_31515569_norm$category_visits))
summary(model6)
##
## Call:
## arima(x = decomp_31515569, order = c(4, 0, 4), xreg = cbind(decomp_31515569_norm$category_visits))
##
## Coefficients:
## ar1 ar2 ar3 ar4 ma1 ma2 ma3 ma4
## 0.3277 1.0349 -0.2456 -0.1330 0.5252 -0.4885 -0.4190 -0.3137
## s.e. 0.1597 0.1553 0.1557 0.1317 0.1519 0.0989 0.1347 0.0633
## intercept cbind(decomp_31515569_norm$category_visits)
## 61.9259 936.1756
## s.e. 314.4471 53.8473
##
## sigma^2 estimated as 138074: log likelihood = -2884.3, aic = 5790.6
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -1.133807 371.5835 233.2479 107.1924 240.4663 0.7355992
## ACF1
## Training set -0.0006865672
This model has a lower AIC value compared to the previous model, but the lowest AIC value belongs to the first model, with the number of sold products in the same category.
However, the model without any regressors has the best AIC value, so it is preferred over the other three. If there were an obligation to use one, the model with category_sold could be used as the final model.
Now, forecasts for the last week will be made.
# Produce last-week forecasts with the project helper forecast_func, using
# ARIMA(4,0,4) and category_sold as the regressor matrix.
# NOTE(review): the raw (unnormalized) data_31515569$category_sold is used
# here, whereas the models above used the normalized series — presumably
# forecast_func handles scaling internally; verify against its definition.
comparison_31515569 <- forecast_func(data = data_31515569$sold_count, ar_order = c(4,0,4), reg_matrix = cbind(data_31515569$category_sold))
# Attach the last 7 event dates so actuals and predictions can be plotted
# against the calendar.
comparison_31515569 <- data.table(cbind(data_31515569$event_date[(nrow(data_31515569)-6):nrow(data_31515569)], comparison_31515569))
colnames(comparison_31515569) <- c("event_date", "Actual", "Prediction")
# Plot actual vs predicted sales for "Tayt" via the project plotting helper.
plotting(comparison_31515569,product_name = "Tayt", col = "",num =2)
The model is not very strong. However, restricting ourselves to a single regressor and an ARIMA model is quite limiting. Given that constraint, the model is acceptable.